This commit is contained in:
louiscklaw
2025-01-31 22:10:02 +08:00
parent 97df42e0d5
commit 2627562070
2852 changed files with 748727 additions and 0 deletions

View File

@@ -0,0 +1,47 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor
- setup.py
- requirements.txt
- .azure-pipelines/code-scan.yml
- .azure-pipelines/scripts/codeScan
- .azure-pipelines/template/docker-template.yml
pool:
vmImage: "ubuntu-latest"
variables:
CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog"
stages:
- stage: DocStyleCodeScan
displayName: DocStyle Code Scan
dependsOn: []
jobs:
- job: DocStyle
displayName: DocStyle
steps:
- template: template/code-scan-template.yml
parameters:
codeScanFileName: "pydocstyle"
uploadPath: "pydocstyle.log"
- stage: BanditCodeScan
displayName: Bandit Code Scan
dependsOn: []
jobs:
- job: Bandit
displayName: Bandit
steps:
- template: template/code-scan-template.yml
parameters:
codeScanFileName: "bandit"
uploadPath: "bandit.log"

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG UBUNTU_VER=22.04
FROM ubuntu:${UBUNTU_VER} as devel
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
python3 \
python3-pip \
python3-dev \
python3-distutils \
autoconf \
build-essential \
git \
libgl1-mesa-glx \
libglib2.0-0 \
numactl \
time \
wget \
bc \
vim
RUN ln -sf $(which python3) /usr/bin/python
RUN python -m pip install pip==24.0
RUN python -m pip install --no-cache-dir setuptools
RUN pip list
WORKDIR /

View File

@@ -0,0 +1,38 @@
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG UBUNTU_VER=22.04
FROM ubuntu:${UBUNTU_VER} as devel
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
aspell \
aspell-en \
python3 \
python3-pip \
python3-dev \
python3-distutils \
wget
RUN ln -sf $(which python3) /usr/bin/python
RUN python -m pip install --no-cache-dir \
bandit \
pyspelling \
pydocstyle
WORKDIR /

View File

@@ -0,0 +1,119 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor/common
- neural_compressor/torch
- examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only
- setup.py
- requirements_pt.txt
- .azure-pipelines/scripts/models
- .azure-pipelines/model-test-3x.yml
- .azure-pipelines/template/docker-template.yml
variables:
OUT_SCRIPT_PATH: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models
SCRIPT_PATH: /neural-compressor/.azure-pipelines/scripts
parameters:
- name: PyTorch_Model_3X
displayName: Run PyTorch models?
type: boolean
default: true
- name: PyTorchModelList
type: object
default:
- opt_125m_woq_gptq_int4
- opt_125m_woq_gptq_nf4_dq_bnb
- opt_125m_woq_gptq_int4_dq_ggml
stages:
- stage: PyTorchModels
displayName: Run PyTorch Model
pool: ICX-16C
dependsOn: []
condition: and(succeeded(), eq('${{ parameters.PyTorch_Model_3X }}', 'true'))
jobs:
- ${{ each model in parameters.PyTorchModelList }}:
- job:
displayName: ${{ model }}
steps:
- template: template/model-template.yml
parameters:
modelName: ${{ model }}
framework: "pytorch"
APIVersion: "3x"
- stage: GenerateLogs
displayName: Generate Report
pool:
vmImage: "ubuntu-latest"
dependsOn: [PyTorchModels]
jobs:
- job: GenerateReport
steps:
- script: |
echo ${BUILD_SOURCESDIRECTORY}
rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true
echo y | docker system prune --all
displayName: "Clean workspace"
- checkout: self
clean: true
displayName: "Checkout out Repo"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: "**/*_summary.log"
path: $(OUT_SCRIPT_PATH)
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: "**/*_tuning_info.log"
path: $(OUT_SCRIPT_PATH)
- task: UsePythonVersion@0
displayName: "Use Python 3.10"
inputs:
versionSpec: "3.10"
- script: |
cd ${OUT_SCRIPT_PATH}
mkdir generated
mkdir last_generated
pip install requests
python -u collect_log_all.py --logs_dir $(OUT_SCRIPT_PATH) --output_dir generated --build_id=$(Build.BuildId)
displayName: "Collect all logs"
- task: DownloadPipelineArtifact@2
continueOnError: true
inputs:
source: "specific"
artifact: "FinalReport"
patterns: "**.log"
path: $(OUT_SCRIPT_PATH)/last_generated
project: $(System.TeamProject)
pipeline: "Model-Test"
runVersion: "specific"
runId: $(refer_buildId)
displayName: "Download last logs"
- script: |
echo "------ Generating final report.html ------"
cd ${OUT_SCRIPT_PATH}
/usr/bin/bash generate_report.sh --WORKSPACE generated --output_dir generated --last_logt_dir last_generated
displayName: "Generate report"
- task: PublishPipelineArtifact@1
inputs:
targetPath: $(OUT_SCRIPT_PATH)/generated
artifact: FinalReport
publishLocation: "pipeline"
displayName: "Publish report"
- script: |
if [ $(is_perf_reg) == 'true' ]; then
echo "Some benchmark regression occurred or the reference data need to be updated, please check artifacts and reports."
exit 1
fi
displayName: "Specify regression"

View File

@@ -0,0 +1,173 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor
- setup.py
- requirements.txt
- .azure-pipelines/model-test.yml
- .azure-pipelines/template/docker-template.yml
- .azure-pipelines/scripts/models
- examples/tensorflow/oob_models/quantization/ptq
- .azure-pipelines/scripts/fwk_version.sh
- .azure-pipelines/scripts/install_nc.sh
exclude:
- test
- neural_compressor/common
- neural_compressor/torch
- neural_compressor/tensorflow
- neural_compressor/onnxrt
pool: MODEL_PERF_TEST_TF
variables:
OUT_SCRIPT_PATH: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models
SCRIPT_PATH: /neural-compressor/.azure-pipelines/scripts
parameters:
- name: TensorFlow_Model
displayName: Run TensorFlow models?
type: boolean
default: true
- name: PyTorch_Model
displayName: Run PyTorch models?
type: boolean
default: true
- name: ONNX_Model
displayName: Run ONNX models?
type: boolean
default: true
- name: TensorFlowModelList
type: object
default:
- resnet50v1.5
- ssd_resnet50_v1
- name: PyTorchModelList
type: object
default:
- resnet18_fx
- name: ONNXModelList
type: object
default:
- resnet50-v1-12
stages:
- stage: TensorFlowModels
displayName: Run TensorFlow Model
pool: MODEL_PERF_TEST
dependsOn: []
condition: and(succeeded(), eq('${{ parameters.TensorFlow_Model }}', 'true'))
jobs:
- ${{ each model in parameters.TensorFlowModelList }}:
- job:
displayName: ${{ model }}
steps:
- template: template/model-template.yml
parameters:
modelName: ${{ model }}
framework: "tensorflow"
- stage: PyTorchModels
displayName: Run PyTorch Model
pool: MODEL_PERF_TEST
dependsOn: []
condition: and(succeeded(), eq('${{ parameters.PyTorch_Model }}', 'true'))
jobs:
- ${{ each model in parameters.PyTorchModelList }}:
- job:
displayName: ${{ model }}
steps:
- template: template/model-template.yml
parameters:
modelName: ${{ model }}
framework: "pytorch"
- stage: ONNXModels
displayName: Run ONNX Model
pool: MODEL_PERF_TEST
dependsOn: []
condition: and(succeeded(), eq('${{ parameters.ONNX_Model }}', 'true'))
jobs:
- ${{ each model in parameters.ONNXModelList }}:
- job:
displayName: ${{ model }}
steps:
- template: template/model-template.yml
parameters:
modelName: ${{ model }}
framework: "onnxrt"
- stage: GenerateLogs
displayName: Generate Report
pool:
vmImage: "ubuntu-latest"
dependsOn: [TensorFlowModels, PyTorchModels, ONNXModels]
jobs:
- job: GenerateReport
steps:
- script: |
echo ${BUILD_SOURCESDIRECTORY}
rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true
echo y | docker system prune --all
displayName: "Clean workspace"
- checkout: self
clean: true
displayName: "Checkout out Repo"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: "**/*_summary.log"
path: $(OUT_SCRIPT_PATH)
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: "**/*_tuning_info.log"
path: $(OUT_SCRIPT_PATH)
- task: UsePythonVersion@0
displayName: "Use Python 3.10"
inputs:
versionSpec: "3.10"
- script: |
cd ${OUT_SCRIPT_PATH}
mkdir generated
mkdir last_generated
pip install requests
python -u collect_log_all.py --logs_dir $(OUT_SCRIPT_PATH) --output_dir generated --build_id=$(Build.BuildId)
displayName: "Collect all logs"
- task: DownloadPipelineArtifact@2
continueOnError: true
inputs:
source: "specific"
artifact: "FinalReport"
patterns: "**.log"
path: $(OUT_SCRIPT_PATH)/last_generated
project: $(System.TeamProject)
pipeline: "Model-Test"
runVersion: "specific"
runId: $(refer_buildId)
displayName: "Download last logs"
- script: |
echo "------ Generating final report.html ------"
cd ${OUT_SCRIPT_PATH}
/usr/bin/bash generate_report.sh --WORKSPACE generated --output_dir generated --last_logt_dir last_generated
displayName: "Generate report"
- task: PublishPipelineArtifact@1
inputs:
targetPath: $(OUT_SCRIPT_PATH)/generated
artifact: FinalReport
publishLocation: "pipeline"
displayName: "Publish report"
- script: |
if [ $(is_perf_reg) == 'true' ]; then
echo "Some benchmark regression occurred or the reference data need to be updated, please check artifacts and reports."
exit 1
fi
displayName: "Specify regression"

View File

@@ -0,0 +1,81 @@
#!/bin/bash
# -------------- general approach start ----------------
# 1. source this file:
#      source path/change_color.sh
# 2. use a COLOR/BG variable:
#      $VARIABLE_NAME && output_content && $RESET
# 3. combine COLOR + BG:
#      $COLOR_VARIABLE && $BG_VARIABLE && output_content && $RESET
# 4. custom escape codes (change the numbers):
#      text color number range: 30-37
#      background number range: 40-47
#      special effects number range: 1-7
#      echo -en "\E[number1;number2;number3m"
#      e.g. BG_GRAY + LIGHT_RED = "echo -en \\E[47;31m"
# -------------- general approach end ----------------
# general setting
# ------------- light_color start----------------
# black
LIGHT_BLACK="echo -en \\E[30m"
# red
LIGHT_RED="echo -en \\E[31m"
# green
LIGHT_GREEN="echo -en \\E[32m"
# yellow
LIGHT_YELLOW="echo -en \\E[33m"
# blue
LIGHT_BLUE="echo -en \\E[34m"
# purple
LIGHT_PURPLE="echo -en \\E[35m"
# cyan
LIGHT_CYAN="echo -en \\E[36m"
# gray
LIGHT_GRAY="echo -en \\E[37m"
# ------------- light_color end----------------
# ------------- bold_color start----------------
# black
BOLD_BLACK="echo -en \\E[1;30m"
# red
BOLD_RED="echo -en \\E[1;31m"
# green
BOLD_GREEN="echo -en \\E[1;32m"
# yellow
BOLD_YELLOW="echo -en \\E[1;33m"
# blue
BOLD_BLUE="echo -en \\E[1;34m"
# purple
BOLD_PURPLE="echo -en \\E[1;35m"
# cyan
BOLD_CYAN="echo -en \\E[1;36m"
# gray
BOLD_GRAY="echo -en \\E[1;37m"
# ------------- bold_color end----------------
# ------------- background_color start----------------
# black
BG_BLACK="echo -en \\E[40m"
# red
BG_RED="echo -en \\E[41m"
# green
BG_GREEN="echo -en \\E[42m"
# yellow
BG_YELLOW="echo -en \\E[43m"
# blue
BG_BLUE="echo -en \\E[44m"
# purple
BG_PURPLE="echo -en \\E[45m"
# cyan
BG_CYAN="echo -en \\E[46m"
# gray
BG_GRAY="echo -en \\E[47m"
# ------------- background_color end----------------
# close
RESET="echo -en \\E[0m"

View File

@@ -0,0 +1,34 @@
#!/bin/bash
for var in "$@"
do
case $var in
--scan_module=*)
scan_module=$(echo $var |cut -f2 -d=)
;;
esac
done
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
RESET="echo -en \\E[0m \\n" # close color
log_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/scanLog"
mkdir -p $log_dir
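# -lll / -iii: bandit's severity and confidence filters - report only HIGH-severity, HIGH-confidence findings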
python -m bandit -r -lll -iii "/neural-compressor/${scan_module}" >$log_dir/bandit.log
exit_code=$?
$BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET
echo "python -m bandit -r -lll -iii /neural-compressor/${scan_module} > $log_dir/bandit.log"
$BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET
$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------"
cat $log_dir/bandit.log
$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET
if [ ${exit_code} -ne 0 ]; then
$BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET
exit 1
fi
$BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET
exit 0

View File

@@ -0,0 +1,15 @@
activ
ans
assertin
datas
ende
lates
masia
mutli
nd
ot
rouge
te
tne
ue
womens

View File

@@ -0,0 +1,43 @@
#!/bin/bash
for var in "$@"
do
case $var in
--scan_module=*)
scan_module=$(echo $var |cut -f2 -d=)
;;
esac
done
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
RESET="echo -en \\E[0m \\n" # close color
work_dir="/neural-compressor/.azure-pipelines/scripts/codeScan/pydocstyle"
log_dir="$work_dir/../scanLog"
mkdir -p $log_dir
scan_path="scan_path.txt"
exit_code=0
for line in $(cat ${work_dir}/${scan_path})
do
pydocstyle --convention=google $line >> $log_dir/pydocstyle.log
if [ $? -ne 0 ]; then
exit_code=1
fi
done
$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd start --------------------------" && $RESET
echo "pydocstyle --convention=google \$line > $log_dir/pydocstyle.log"
$BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd end --------------------------" && $RESET
$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------"
cat $log_dir/pydocstyle.log
$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET
if [ ${exit_code} -ne 0 ]; then
$BOLD_RED && echo "Error!! Please Click on the artifact button to download and view DocStyle error details." && $RESET
exit 1
fi
$BOLD_PURPLE && echo "Congratulations, DocStyle check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET
exit 0

View File

@@ -0,0 +1,27 @@
/neural-compressor/neural_compressor/adaptor/mxnet_utils
/neural-compressor/neural_compressor/adaptor/ox_utils
/neural-compressor/neural_compressor/adaptor/tensorflow.py
/neural-compressor/neural_compressor/adaptor/tf_utils
/neural-compressor/neural_compressor/algorithm
/neural-compressor/neural_compressor/benchmark.py
/neural-compressor/neural_compressor/config.py
/neural-compressor/neural_compressor/contrib
/neural-compressor/neural_compressor/experimental
/neural-compressor/neural_compressor/mix_precision.py
/neural-compressor/neural_compressor/model
/neural-compressor/neural_compressor/objective.py
/neural-compressor/neural_compressor/pruner
/neural-compressor/neural_compressor/quantization.py
/neural-compressor/neural_compressor/strategy
/neural-compressor/neural_compressor/training.py
/neural-compressor/neural_compressor/utils
/neural-compressor/neural_compressor/common
/neural-compressor/neural_compressor/tensorflow
/neural-compressor/neural_compressor/torch/algorithms/layer_wise
/neural-compressor/neural_compressor/torch/algorithms/mixed_precision
/neural-compressor/neural_compressor/torch/algorithms/mx_quant
/neural-compressor/neural_compressor/torch/algorithms/pt2e_quant
/neural-compressor/neural_compressor/torch/algorithms/smooth_quant
/neural-compressor/neural_compressor/torch/algorithms/static_quant
/neural-compressor/neural_compressor/torch/algorithms/weight_only
/neural-compressor/neural_compressor/torch/export

View File

@@ -0,0 +1,10 @@
#!/bin/bash
echo "export FWs version..."
export tensorflow_version='2.15.0-official'
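# the "-official" suffix makes the model scripts install stock tensorflow instead of intel-tensorflow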
export pytorch_version='2.5.1+cpu'
export torchvision_version='0.20.1'
export ipex_version='2.5.0+cpu'
export onnx_version='1.17.0'
export onnxruntime_version='1.20.0'
export mxnet_version='1.9.1'

View File

@@ -0,0 +1,31 @@
#!/bin/bash
echo -e "##[group]Install Neural Compressor ... "
cd /neural-compressor
if [[ $1 = *"3x_pt"* ]]; then
python -m pip install --no-cache-dir -r requirements_pt.txt
if [[ $1 = *"3x_pt_fp8"* ]]; then
pip uninstall neural_compressor_3x_pt -y || true
python setup.py pt bdist_wheel
else
echo -e "\n Install torch CPU ... "
pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
python -m pip install intel-extension-for-pytorch==2.5.0 oneccl_bind_pt==2.5.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
python -m pip install --no-cache-dir -r requirements.txt
python setup.py bdist_wheel
fi
pip install --no-deps dist/neural_compressor*.whl --force-reinstall
elif [[ $1 = *"3x_tf"* ]]; then
python -m pip install --no-cache-dir -r requirements.txt
python -m pip install --no-cache-dir -r requirements_tf.txt
python setup.py bdist_wheel
pip install dist/neural_compressor*.whl --force-reinstall
else
python -m pip install --no-cache-dir -r requirements.txt
python setup.py bdist_wheel
pip install dist/neural_compressor*.whl --force-reinstall
fi
echo -e "\n pip list after install Neural Compressor ... "
echo "##[endgroup]"
pip list

View File

@@ -0,0 +1,79 @@
import argparse
import os
import requests
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument("--logs_dir", type=str, default=".")
parser.add_argument("--output_dir", type=str, default=".")
parser.add_argument("--build_id", type=str, default="0")
args = parser.parse_args()
print(args)
def main():
file_dir = args.logs_dir
summary_content = ["OS;Platform;Framework;Version;Precision;Model;Mode;Type;BS;Value;Url\n"]
tuning_info_content = ["OS;Platform;Framework;Version;Model;Strategy;Tune_time\n"]
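# Illustrative summary row matching the header above (the accuracy value here is hypothetical):
#   linux;icx;tensorflow;2.15.0-official;INT8;resnet50v1.5;Inference;Accuracy;1;0.765;<url>
# The "<url>" placeholder is replaced below with the artifact download link.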
url_dict = parse_download_url()
# get full path of all files
for root, dirs, files in os.walk(file_dir):
for name in files:
file_name = os.path.join(root, name)
print(file_name)
if "_summary.log" in name:
for line in open(file_name, "r"):
if "linux" in line:
line = line.replace("<url>", parse_summary_log(line, url_dict))
summary_content.append(line)
if "_tuning_info.log" in name:
for line in open(file_name, "r"):
if "linux" in line:
line = line.replace("<url>", parse_tuning_log(line, url_dict))
tuning_info_content.append(line)
f = open(args.output_dir + "/summary.log", "a")
for summary in summary_content:
f.writelines(str(summary))
f2 = open(args.output_dir + "/tuning_info.log", "a")
for tuning_info in tuning_info_content:
f2.writelines(str(tuning_info))
def parse_tuning_log(line, url_dict):
"""Parsing {Framework}-{Model}-tune.log to get tuning result."""
result = line.split(";")
OS, Platform, Framework, Version, Model, Strategy, Tune_time, Tuning_trials, URL, __ = result
file_name = f"{Framework}-{Model}-tune.log"
download_url = url_dict.get(f"{Framework}_{Model}")
download_url = f"{download_url}{file_name}"
return download_url
def parse_summary_log(line, url_dict):
"""Parse {Framework}-{Model}-tune.log to get benchmarking accuracy result."""
result = line.split(";")
OS, Platform, Framework, Version, Precision, Model, Mode, Type, BS, Value, Url = result
file_name = f"{Framework}-{Model}-tune.log"
download_url = url_dict.get(f"{Framework}_{Model}")
download_url = f"{download_url}{file_name}"
return download_url
def parse_download_url():
"""Get azure artifact information."""
azure_artifact_api_url = (
f"https://dev.azure.com/lpot-inc/neural-compressor/_apis/build/builds/{args.build_id}/artifacts?api-version=5.1"
)
azure_artifacts_data = dict(requests.get(azure_artifact_api_url).json().items())
artifact_count = azure_artifacts_data.get("count")
artifact_value = azure_artifacts_data.get("value")
url_dict = {}
for item in artifact_value:
artifact_download_url = item.get("resource").get("downloadUrl")
artifact_download_url = f"{artifact_download_url[:-3]}file&subPath=%2F"
url_dict[item.get("name")] = artifact_download_url
return url_dict
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,309 @@
import argparse
import os
import re
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument("--framework", type=str, required=True)
parser.add_argument("--fwk_ver", type=str, required=True)
parser.add_argument("--model", type=str, required=True)
parser.add_argument("--logs_dir", type=str, default=".")
parser.add_argument("--output_dir", type=str, default=".")
parser.add_argument("--build_id", type=str, default="0")
parser.add_argument("--stage", type=str, default="collect_log")
parser.add_argument("--gap", type=float, default=0.05)
parser.add_argument("--inc_new_api", type=str, default="")
args = parser.parse_args()
print("====== collecting model test log =======")
OS = "linux"
PLATFORM = "icx"
URL = (
"https://dev.azure.com/lpot-inc/neural-compressor/_build/results?buildId="
+ args.build_id
+ "&view=artifacts&pathAsName=false&type=publishedArtifacts"
)
OOB_MODEL_LIST = ["darknet19", "densenet-121", "resnet-101"]
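# Out-of-box (OOB) TensorFlow models; their accuracy is reported as "unknown" instead of being checked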
def get_model_tuning_dict_results():
tuning_result_dict = {}
if os.path.exists(tuning_log):
print("tuning log found")
tmp = {"fp32_acc": 0, "int8_acc": 0, "tuning_trials": 0}
with open(tuning_log, "r") as f:
for line in f:
parse_tuning_line(line, tmp)
print(tmp)
tuning_result_dict = {
"OS": OS,
"Platform": PLATFORM,
"Framework": args.framework,
"Version": args.fwk_ver,
"Model": args.model,
"Strategy": tmp.get("strategy", "basic"),
"Tune_time": tmp.get("tune_time"),
}
benchmark_accuracy_result_dict = {
"int8": {
"OS": OS,
"Platform": PLATFORM,
"Framework": args.framework,
"Version": args.fwk_ver,
"Model": args.model,
"Mode": "Inference",
"Type": "Accuracy",
"BS": 1,
"Value": tmp.get("int8_acc"),
"Url": URL,
},
"fp32": {
"OS": OS,
"Platform": PLATFORM,
"Framework": args.framework,
"Version": args.fwk_ver,
"Model": args.model,
"Mode": "Inference",
"Type": "Accuracy",
"BS": 1,
"Value": tmp.get("fp32_acc"),
"Url": URL,
},
}
return tuning_result_dict, benchmark_accuracy_result_dict
else:
return {}, {}
def get_model_benchmark_dict_results():
benchmark_performance_result_dict = {"int8": {}, "fp32": {}}
for precision in ["int8", "fp32"]:
throughput = 0.0
bs = 1
for root, dirs, files in os.walk(args.logs_dir):
for name in files:
file_name = os.path.join(root, name)
if "performance-" + precision in name:
for line in open(file_name, "r"):
result = parse_perf_line(line)
if result.get("throughput"):
throughput += result.get("throughput")
if result.get("batch_size"):
bs = result.get("batch_size")
benchmark_performance_result_dict[precision] = {
"OS": OS,
"Platform": PLATFORM,
"Framework": args.framework,
"Version": args.fwk_ver,
"Model": args.model,
"Mode": "Inference",
"Type": "Performance",
"BS": 1,
"Value": throughput,
"Url": URL,
}
return benchmark_performance_result_dict
def get_refer_data():
refer_log = os.path.join(f"{args.logs_dir}_refer_log", f"{args.framework}_{args.model}_summary.log")
result = {}
if os.path.exists(refer_log):
with open(refer_log, "r") as f:
lines = f.readlines()
keys = lines[0].split(";")
values = [lines[i].split(";") for i in range(1, len(lines))]
for value in values:
precision = value[keys.index("Precision")]
Type = value[keys.index("Type")]
result[f"{precision}_{Type}"] = (
float(value[keys.index("Value")]) if value[keys.index("Value")] != "unknown" else "unknown"
)
return result
else:
print(f"refer log file: {refer_log} not found")
return 0
def collect_log():
results = []
tuning_infos = []
print(f"quantization log dir is {tuning_log}")
# get model tuning results
if os.path.exists(tuning_log):
print("quantization log found")
tmp = {"fp32_acc": 0, "int8_acc": 0, "tuning_trials": 0}
with open(tuning_log, "r") as f:
for line in f:
parse_tuning_line(line, tmp)
print(tmp)
# OOB models do not need accuracy results
if (args.model in OOB_MODEL_LIST) and args.framework == "tensorflow":
tmp["fp32_acc"], tmp["int8_acc"] = "unknown", "unknown"
# set for 3x woq models
if args.inc_new_api.split("_")[0] == "3x":
tmp["fp32_acc"], tmp["tuning_trials"], tmp["strategy"] = "unknown", "", ""
if "acc_bs" in tmp:
acc_bs = tmp["acc_bs"]
else:
acc_bs = 1
results.append(
"{};{};{};{};FP32;{};Inference;Accuracy;{};{};{}\n".format(
OS, PLATFORM, args.framework, args.fwk_ver, args.model, acc_bs, tmp["fp32_acc"], "<url>"
)
)
results.append(
"{};{};{};{};INT8;{};Inference;Accuracy;{};{};{}\n".format(
OS, PLATFORM, args.framework, args.fwk_ver, args.model, acc_bs, tmp["int8_acc"], "<url>"
)
)
tuning_infos.append(
";".join(
[
OS,
PLATFORM,
args.framework,
args.fwk_ver,
args.model,
tmp.get("strategy", "basic"),
str(tmp["tune_time"]),
str(tmp["tuning_trials"]),
"<url>",
f"{round(tmp['max_mem_size'] / tmp['total_mem_size'] * 100, 4)}%",
]
)
+ "\n"
)
# get model benchmark results
if args.inc_new_api.split("_")[0] != "3x":
for precision in ["int8", "fp32"]:
throughput = 0.0
bs = 1
for root, dirs, files in os.walk(args.logs_dir):
for name in files:
file_name = os.path.join(root, name)
print(file_name)
if "performance-" + precision in name:
for line in open(file_name, "r"):
result = parse_perf_line(line)
if result.get("throughput"):
throughput += result.get("throughput")
if result.get("batch_size"):
bs = result.get("batch_size")
results.append(
"{};{};{};{};{};{};Inference;Performance;{};{};{}\n".format(
OS, PLATFORM, args.framework, args.fwk_ver, precision.upper(), args.model, bs, throughput, URL
)
)
# write model logs
f = open(args.output_dir + "/" + args.framework + "_" + args.model + "_summary.log", "a")
f.writelines("OS;Platform;Framework;Version;Precision;Model;Mode;Type;BS;Value;Url\n")
for result in results:
f.writelines(str(result))
f2 = open(args.output_dir + "/" + args.framework + "_" + args.model + "_tuning_info.log", "a")
f2.writelines("OS;Platform;Framework;Version;Model;Strategy;Tune_time\n")
for tuning_info in tuning_infos:
f2.writelines(str(tuning_info))
def parse_tuning_line(line, tmp):
tuning_strategy = re.search(r"Tuning strategy:\s+([A-Za-z]+)", line)
if tuning_strategy and tuning_strategy.group(1):
tmp["strategy"] = tuning_strategy.group(1)
baseline_acc = re.search(
r"FP32 baseline is:\s+\[Accuracy:\s(\d+(\.\d+)?), Duration \(seconds\):\s*(\d+(\.\d+)?)\]", line
)
if baseline_acc and baseline_acc.group(1):
tmp["fp32_acc"] = float(baseline_acc.group(1))
tuned_acc = re.search(
r"Best tune result is:\s+\[Accuracy:\s(\d+(\.\d+)?), Duration \(seconds\):\s(\d+(\.\d+)?)\]", line
)
if tuned_acc and tuned_acc.group(1):
tmp["int8_acc"] = float(tuned_acc.group(1))
if args.inc_new_api.split("_")[0] == "3x":
quant_acc = re.search(r"Accuracy:\s+(\d+(\.\d+)?)", line)
if quant_acc and quant_acc.group(1):
tmp["int8_acc"] = float(quant_acc.group(1))
batch_size = re.search(r"Batch size = ([0-9]+)", line)
if batch_size and batch_size.group(1):
tmp["acc_bs"] = int(batch_size.group(1))
tune_trial = re.search(r"Tune \d*\s*result is:", line)
if tune_trial:
tmp["tuning_trials"] += 1
tune_time = re.search(r"Tuning time spend:\s+(\d+(\.\d+)?)s", line)
if tune_time and tune_time.group(1):
tmp["tune_time"] = int(tune_time.group(1))
fp32_model_size = re.search(r"The input model size is:\s+(\d+(\.\d+)?)", line)
if fp32_model_size and fp32_model_size.group(1):
tmp["fp32_model_size"] = int(fp32_model_size.group(1))
int8_model_size = re.search(r"The output model size is:\s+(\d+(\.\d+)?)", line)
if int8_model_size and int8_model_size.group(1):
tmp["int8_model_size"] = int(int8_model_size.group(1))
total_mem_size = re.search(r"Total resident size\D*([0-9]+)", line)
if total_mem_size and total_mem_size.group(1):
tmp["total_mem_size"] = float(total_mem_size.group(1))
max_mem_size = re.search(r"Maximum resident set size\D*([0-9]+)", line)
if max_mem_size and max_mem_size.group(1):
tmp["max_mem_size"] = float(max_mem_size.group(1))
def parse_perf_line(line):
perf_data = {}
throughput = re.search(r"Throughput:\s+(\d+(\.\d+)?)", line)
if throughput and throughput.group(1):
perf_data.update({"throughput": float(throughput.group(1))})
batch_size = re.search(r"Batch size = ([0-9]+)", line)
if batch_size and batch_size.group(1):
perf_data.update({"batch_size": int(batch_size.group(1))})
return perf_data
def check_status(precision, precision_upper, check_accuracy=False):
performance_result = get_model_benchmark_dict_results()
current_performance = performance_result.get(precision).get("Value")
refer_performance = refer.get(f"{precision_upper}_Performance")
print(f"current_performance_data = {current_performance:.3f}, refer_performance_data = {refer_performance:.3f}")
assert (refer_performance - current_performance) / refer_performance <= args.gap
if check_accuracy:
_, accuracy_result = get_model_tuning_dict_results()
current_accuracy = accuracy_result.get(precision).get("Value")
refer_accuracy = refer.get(f"{precision_upper}_Accuracy")
print(f"current_accuracy_data = {current_accuracy:.3f}, refer_accuarcy_data = {refer_accuracy:.3f}")
assert abs(current_accuracy - refer_accuracy) <= 0.001
if __name__ == "__main__":
tuning_log = os.path.join(args.logs_dir, f"{args.framework}-{args.model}-tune.log")
refer = get_refer_data()
if args.stage == "collect_log":
collect_log()
elif args.stage == "int8_benchmark" and refer:
check_status("int8", "INT8")
elif args.stage == "fp32_benchmark" and refer:
check_status("fp32", "FP32")
elif not refer:
print("skip check status")
else:
raise ValueError(f"{args.stage} does not exist")

View File

@@ -0,0 +1,147 @@
#!/bin/bash
set -eo pipefail
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
for i in "$@"; do
case $i in
--yaml=*)
yaml=$(echo $i | sed "s/${PATTERN}//")
;;
--framework=*)
framework=$(echo $i | sed "s/${PATTERN}//")
;;
--fwk_ver=*)
fwk_ver=$(echo $i | sed "s/${PATTERN}//")
;;
--torch_vision_ver=*)
torch_vision_ver=$(echo $i | sed "s/${PATTERN}//")
;;
--model=*)
model=$(echo $i | sed "s/${PATTERN}//")
;;
--model_src_dir=*)
model_src_dir=$(echo $i | sed "s/${PATTERN}//")
;;
--dataset_location=*)
dataset_location=$(echo $i | sed "s/${PATTERN}//")
;;
--batch_size=*)
batch_size=$(echo $i | sed "s/${PATTERN}//")
;;
--strategy=*)
strategy=$(echo $i | sed "s/${PATTERN}//")
;;
--new_benchmark=*)
new_benchmark=$(echo $i | sed "s/${PATTERN}//")
;;
--inc_new_api=*)
inc_new_api=$(echo $i | sed "s/${PATTERN}//")
;;
*)
echo "Parameter $i not recognized."
exit 1
;;
esac
done
SCRIPTS_PATH="/neural-compressor/.azure-pipelines/scripts/models"
log_dir="/neural-compressor/.azure-pipelines/scripts/models"
if [[ "${inc_new_api}" == "3x"* ]]; then
WORK_SOURCE_DIR="/neural-compressor/examples/3.x_api/${framework}"
git clone https://github.com/intel/intel-extension-for-transformers.git /itrex
cd /itrex
pip install -r requirements.txt
pip install -v .
else
WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"
fi
$BOLD_YELLOW && echo "processing ${framework}-${fwk_ver}-${model}" && $RESET
$BOLD_YELLOW && echo "======= creat log_dir =========" && $RESET
if [ -d "${log_dir}/${model}" ]; then
$BOLD_GREEN && echo "${log_dir}/${model} already exists, don't need to mkdir." && $RESET
else
$BOLD_GREEN && echo "no log dir ${log_dir}/${model}, create." && $RESET
cd ${log_dir}
mkdir ${model}
fi
$BOLD_YELLOW && echo "====== install requirements ======" && $RESET
/bin/bash /neural-compressor/.azure-pipelines/scripts/install_nc.sh ${inc_new_api}
mkdir -p ${WORK_SOURCE_DIR}
cd ${WORK_SOURCE_DIR}
if [[ "${inc_new_api}" == "false" ]]; then
echo "copy old api examples to workspace..."
git clone -b old_api_examples https://github.com/intel/neural-compressor.git old-lpot-models
cd old-lpot-models
git branch
cd -
rm -rf ${model_src_dir}
mkdir -p ${model_src_dir}
cp -r old-lpot-models/examples/${framework}/${model_src_dir} ${WORK_SOURCE_DIR}/${model_src_dir}/../
fi
cd ${model_src_dir}
if [[ "${fwk_ver}" != "latest" ]]; then
pip install ruamel.yaml==0.17.40
pip install psutil
pip install protobuf==4.23.4
if [[ "${framework}" == "tensorflow" ]]; then
if [[ "${fwk_ver}" == *"-official" ]]; then
pip install tensorflow==${fwk_ver%-official}
else
pip install intel-tensorflow==${fwk_ver}
fi
elif [[ "${framework}" == "pytorch" ]]; then
pip install torch==${fwk_ver} --index-url https://download.pytorch.org/whl/cpu
pip install torchvision==${torch_vision_ver} --index-url https://download.pytorch.org/whl/cpu
elif [[ "${framework}" == "onnxrt" ]]; then
pip install onnx==1.15.0
pip install onnxruntime==${fwk_ver}
fi
fi
if [ -f "requirements.txt" ]; then
sed -i '/neural-compressor/d' requirements.txt
if [ "${framework}" == "onnxrt" ]; then
sed -i '/^onnx>=/d;/^onnx==/d;/^onnxruntime>=/d;/^onnxruntime==/d' requirements.txt
fi
if [ "${framework}" == "tensorflow" ]; then
sed -i '/tensorflow==/d;/tensorflow$/d' requirements.txt
sed -i '/^intel-tensorflow/d' requirements.txt
fi
if [ "${framework}" == "pytorch" ]; then
sed -i '/torch==/d;/torch$/d;/torchvision==/d;/torchvision$/d' requirements.txt
fi
n=0
until [ "$n" -ge 5 ]; do
python -m pip install -r requirements.txt && break
n=$((n + 1))
sleep 5
done
pip list
else
$BOLD_RED && echo "Not found requirements.txt file." && $RESET
fi
if [[ "${inc_new_api}" == "false" ]]; then
$BOLD_YELLOW && echo "======== update yaml config ========" && $RESET
$BOLD_YELLOW && echo -e "\nPrint origin yaml..." && $RESET
cat ${yaml}
python ${SCRIPTS_PATH}/update_yaml_config.py \
--yaml=${yaml} \
--framework=${framework} \
--dataset_location=${dataset_location} \
--batch_size=${batch_size} \
--strategy=${strategy} \
--new_benchmark=${new_benchmark} \
--multi_instance='true'
$BOLD_YELLOW && echo -e "\nPrint updated yaml... " && $RESET
cat ${yaml}
fi

View File

@@ -0,0 +1,625 @@
#!/bin/bash
# WORKSPACE=.
# summaryLog=summary.log
# summaryLogLast=summary.log
# tuneLog=tuning_info.log
# tuneLogLast=tuning_info.log
# overview_log=summary_overview.log
# coverage_summary=coverage_summary.log
# nc_code_lines_summary=nc_code_lines_summary.csv
# engine_code_lines_summary=engine_code_lines_summary.csv
#lines_coverage_threshold=80
#branches_coverage_threshold=75
#
#pass_status="<td style=\"background-color:#90EE90\">Pass</td>"
#fail_status="<td style=\"background-color:#FFD2D2\">Fail</td>"
#verify_status="<td style=\"background-color:#f2ea0a\">Verify</td>"
# shellcheck disable=SC2120
while [[ $# -gt 0 ]];do
key=${1}
case ${key} in
-w|--WORKSPACE)
WORKSPACE=${2}
shift 2
;;
--script_path)
script_path=${2}
shift 2
;;
--output_dir)
output_dir=${2}
shift 2
;;
--last_logt_dir)
last_logt_dir=${2}
shift 2
;;
*)
shift
;;
esac
done
echo "workspace: ${WORKSPACE}"
echo "script_path: ${script_path}"
summaryLog="${WORKSPACE}/summary.log"
tuneLog="${WORKSPACE}/tuning_info.log"
echo "summaryLog: ${summaryLog}"
echo "tuneLog: ${tuneLog}"
echo "last_logt_dir: ${last_logt_dir}"
summaryLogLast="${last_logt_dir}/summary.log"
tuneLogLast="${last_logt_dir}/tuning_info.log"
echo "summaryLogLast: ${summaryLogLast}"
echo "tuneLogLast: ${tuneLogLast}"
ghprbPullId=${SYSTEM_PULLREQUEST_PULLREQUESTNUMBER}
MR_source_branch=${SYSTEM_PULLREQUEST_SOURCEBRANCH}
MR_source_repo=${SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI}
MR_target_branch=${SYSTEM_PULLREQUEST_TARGETBRANCH}
repo_url=${BUILD_REPOSITORY_URI}
source_commit_id=${BUILD_SOURCEVERSION}
build_id=${BUILD_BUILDID}
echo "MR_source_branch: ${MR_source_branch}"
echo "MR_source_repo: ${MR_source_repo}"
echo "MR_target_branch: ${MR_target_branch}"
echo "repo_url: ${repo_url}"
echo "commit_id: ${source_commit_id}"
echo "ghprbPullId: ${ghprbPullId}"
echo "build_id: ${build_id}"
function main {
generate_html_head
generate_html_body
generate_results
generate_html_footer
}
function generate_inference {
# echo "Generating inference"
awk -v framework="${framework}" -v fw_version="${fw_version}" -v model="${model}" -v os="${os}" -v platform=${platform} -F ';' '
BEGIN {
fp32_perf_bs = "nan";
fp32_perf_value = "nan";
fp32_perf_url = "nan";
fp32_acc_bs = "nan";
fp32_acc_value = "nan";
fp32_acc_url = "nan";
int8_perf_bs = "nan";
int8_perf_value = "nan";
int8_perf_url = "nan";
int8_acc_bs = "nan";
int8_acc_value = "nan";
int8_acc_url = "nan";
}{
if($1 == os && $2 == platform && $3 == framework && $4 == fw_version && $6 == model) {
// FP32
if($5 == "FP32") {
// Performance
if($8 == "Performance") {
fp32_perf_bs = $9;
fp32_perf_value = $10;
fp32_perf_url = $11;
}
// Accuracy
if($8 == "Accuracy") {
fp32_acc_bs = $9;
fp32_acc_value = $10;
fp32_acc_url = $11;
}
}
// INT8
if($5 == "INT8") {
// Performance
if($8 == "Performance") {
int8_perf_bs = $9;
int8_perf_value = $10;
int8_perf_url = $11;
}
// Accuracy
if($8 == "Accuracy") {
int8_acc_bs = $9;
int8_acc_value = $10;
int8_acc_url = $11;
}
}
}
}END {
printf("%s;%s;%s;%s;", int8_perf_bs,int8_perf_value,int8_acc_bs,int8_acc_value);
printf("%s;%s;%s;%s;", fp32_perf_bs,fp32_perf_value,fp32_acc_bs,fp32_acc_value);
printf("%s;%s;%s;%s;", int8_perf_url,int8_acc_url,fp32_perf_url,fp32_acc_url);
}
' "$1"
}
function generate_html_core {
echo "--- current values ---"
echo ${current_values}
echo "--- last values ---"
echo ${last_values}
tuning_strategy=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLog} |awk -F';' '{print $6}')
tuning_time=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLog} |awk -F';' '{print $7}')
tuning_count=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLog} |awk -F';' '{print $8}')
tuning_log=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLog} |awk -F';' '{print $9}')
echo "<tr><td rowspan=3>${platform}</td><td rowspan=3>${os}</td><td rowspan=3>${framework}</td><td rowspan=3>${fw_version}</td><td rowspan=3>${model}</td><td>New</td><td><a href=${tuning_log}>${tuning_strategy}</a></td>" >> ${output_dir}/report.html
echo "<td><a href=${tuning_log}>${tuning_time}</a></td><td><a href=${tuning_log}>${tuning_count}</a></td>" >> ${output_dir}/report.html
tuning_strategy=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLogLast} |awk -F';' '{print $6}')
tuning_time=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLogLast} |awk -F';' '{print $7}')
tuning_count=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLogLast} |awk -F';' '{print $8}')
tuning_log=$(grep "^${os};${platform};${framework};${fw_version};${model};" ${tuneLogLast} |awk -F';' '{print $9}')
echo |awk -F ';' -v current_values="${current_values}" -v last_values="${last_values}" \
-v tuning_strategy="${tuning_strategy}" -v tuning_time="${tuning_time}" \
-v tuning_count="${tuning_count}" -v tuning_log="${tuning_log}" -F ';' '
function abs(x) { return x < 0 ? -x : x }
function show_new_last(batch, link, value, metric) {
if(value ~/[1-9]/) {
if (metric == "perf" || metric == "ratio") {
printf("<td>%s</td> <td><a href=%s>%.2f</a></td>\n",batch,link,value);
} else {
printf("<td>%s</td> <td><a href=%s>%.2f%</a></td>\n",batch,link,value*100);
}
} else {
if(link == "" || value == "N/A" || value == "unknown") {
printf("<td></td> <td></td>\n");
} else {
printf("<td>%s</td> <td><a href=%s>Failure</a></td>\n",batch,link);
}
}
}
function compare_current(int8_result, fp32_result, metric) {
if(int8_result ~/[1-9]/ && fp32_result ~/[1-9]/) {
if(metric == "acc") {
target = (int8_result - fp32_result) / fp32_result;
if(target >= -0.01) {
printf("<td rowspan=3 style=\"background-color:#90EE90\">%.2f %</td>", target*100);
}else if(target < -0.05) {
printf("<td rowspan=3 style=\"background-color:#FFD2D2\">%.2f %</td>", target*100);
job_status = "fail"
}else{
printf("<td rowspan=3>%.2f %</td>", target*100);
}
}else if(metric == "perf") {
target = int8_result / fp32_result;
if(target >= 1.5) {
printf("<td style=\"background-color:#90EE90\">%.2f</td>", target);
}else if(target < 1) {
printf("<td style=\"background-color:#FFD2D2\">%.2f</td>", target);
perf_status = "fail"
}else{
printf("<td>%.2f</td>", target);
}
}
else {
target = int8_result / fp32_result;
if(target >= 2) {
printf("<td rowspan=3 style=\"background-color:#90EE90\">%.2f</td>", target);
}else if(target < 1) {
printf("<td rowspan=3 style=\"background-color:#FFD2D2\">%.2f</td>", target);
job_status = "fail"
}else{
printf("<td rowspan=3>%.2f</td>", target);
}
}
}else {
printf("<td rowspan=3></td>");
}
}
function compare_result(new_result, previous_result, metric) {
if (new_result ~/[1-9]/ && previous_result ~/[1-9]/) {
if(metric == "acc") {
target = new_result - previous_result;
if(target > -0.00001 && target < 0.00001) {
status_png = "background-color:#90EE90";
} else {
status_png = "background-color:#FFD2D2";
job_status = "fail"
}
printf("<td style=\"%s\" colspan=2>%.2f %</td>", status_png, target*100);
} else {
target = new_result / previous_result;
if(target <= 1.084 && target >= 0.915) {
status_png = "background-color:#90EE90";
} else {
status_png = "background-color:#FFD2D2";
perf_status = "fail"
}
printf("<td style=\"%s\" colspan=2>%.2f</td>", status_png, target);
}
} else {
if((new_result == nan && previous_result == nan) || new_result == "unknown"){
printf("<td class=\"col-cell col-cell3\" colspan=2></td>");
} else{
job_status = "fail"
status_png = "background-color:#FFD2D2";
printf("<td style=\"%s\" colspan=2></td>", status_png);
}
}
}
function compare_ratio(int8_perf_value, fp32_perf_value, last_int8_perf_value, last_fp32_perf_value) {
if (int8_perf_value ~/[1-9]/ && fp32_perf_value ~/[1-9]/ && last_int8_perf_value ~/[1-9]/ && last_fp32_perf_value ~/[1-9]/) {
new_result = int8_perf_value / fp32_perf_value
previous_result = last_int8_perf_value / last_fp32_perf_value
target = new_result / previous_result;
if (target <= 1.084 && target >= 0.915) {
status_png = "background-color:#90EE90";
} else {
status_png = "background-color:#FFD2D2";
ratio_status = "fail"
}
printf("<td style=\"%s\">%.2f</td>", status_png, target);
} else {
if (new_result == nan && previous_result == nan) {
printf("<td class=\"col-cell col-cell3\"></td>");
} else {
if (new_result == nan) {
ratio_status = "fail"
status_png = "background-color:#FFD2D2";
printf("<td style=\"%s\"></td>", status_png);
} else {
printf("<td class=\"col-cell col-cell3\"></td>");
}
}
}
}
BEGIN {
job_status = "pass"
perf_status = "pass"
ratio_status = "pass"
// issue list
jira_mobilenet = "https://jira01.devtools.intel.com/browse/PADDLEQ-384";
jira_resnext = "https://jira01.devtools.intel.com/browse/PADDLEQ-387";
jira_ssdmobilenet = "https://jira01.devtools.intel.com/browse/PADDLEQ-406";
}{
// Current values
split(current_values,current_value,";");
// Current
// INT8 Performance results
int8_perf_batch=current_value[1]
int8_perf_value=current_value[2]
int8_perf_url=current_value[9]
show_new_last(int8_perf_batch, int8_perf_url, int8_perf_value, "perf");
// INT8 Accuracy results
int8_acc_batch=current_value[3]
int8_acc_value=current_value[4]
int8_acc_url=current_value[10]
show_new_last(int8_acc_batch, int8_acc_url, int8_acc_value, "acc");
// FP32 Performance results
fp32_perf_batch=current_value[5]
fp32_perf_value=current_value[6]
fp32_perf_url=current_value[11]
show_new_last(fp32_perf_batch, fp32_perf_url, fp32_perf_value, "perf");
// FP32 Accuracy results
fp32_acc_batch=current_value[7]
fp32_acc_value=current_value[8]
fp32_acc_url=current_value[12]
show_new_last(fp32_acc_batch, fp32_acc_url, fp32_acc_value, "acc");
// Compare Current
compare_current(int8_perf_value, fp32_perf_value, "perf");
compare_current(int8_acc_value, fp32_acc_value, "acc");
// Last values
split(last_values,last_value,";");
// Last
printf("</tr>\n<tr><td>Last</td><td><a href=%4$s>%1$s</a></td><td><a href=%4$s>%2$s</a></td><td><a href=%4$s>%3$s</a></td>", tuning_strategy, tuning_time, tuning_count, tuning_log);
// Show last INT8 Performance results
last_int8_perf_batch=last_value[1]
last_int8_perf_value=last_value[2]
last_int8_perf_url=last_value[9]
show_new_last(last_int8_perf_batch, last_int8_perf_url, last_int8_perf_value, "perf");
// Show last INT8 Accuracy results
last_int8_acc_batch=last_value[3]
last_int8_acc_value=last_value[4]
last_int8_acc_url=last_value[10]
show_new_last(last_int8_acc_batch, last_int8_acc_url, last_int8_acc_value, "acc");
// Show last FP32 Performance results
last_fp32_perf_batch=last_value[5]
last_fp32_perf_value=last_value[6]
last_fp32_perf_url=last_value[11]
show_new_last(last_fp32_perf_batch, last_fp32_perf_url, last_fp32_perf_value, "perf");
// Show last FP32 Accuracy results
last_fp32_acc_batch=last_value[7]
last_fp32_acc_value=last_value[8]
last_fp32_acc_url=last_value[12]
show_new_last(last_fp32_acc_batch, last_fp32_acc_url, last_fp32_acc_value, "acc");
compare_current(last_int8_perf_value, last_fp32_perf_value, "perf");
printf("</tr>")
// current vs last
printf("</tr>\n<tr><td>New/Last</td><td colspan=3 class=\"col-cell3\"></td>");
// Compare INT8 Performance results
compare_result(int8_perf_value, last_int8_perf_value,"perf");
// Compare INT8 Accuracy results
compare_result(int8_acc_value, last_int8_acc_value, "acc");
// Compare FP32 Performance results
compare_result(fp32_perf_value, last_fp32_perf_value, "perf");
// Compare FP32 Accuracy results
compare_result(fp32_acc_value, last_fp32_acc_value, "acc");
// Compare INT8 FP32 Performance ratio
compare_ratio(int8_perf_value, fp32_perf_value, last_int8_perf_value, last_fp32_perf_value);
printf("</tr>\n");
status = (perf_status == "fail" && ratio_status == "fail") ? "fail" : "pass"
status = (job_status == "fail") ? "fail" : status
} END{
printf("\n%s", status);
}
' >> ${output_dir}/report.html
job_state=$(tail -1 ${WORKSPACE}/report.html)
sed -i '$s/.*//' ${WORKSPACE}/report.html
if [ ${job_state} == 'fail' ]; then
echo "====== perf_reg ======"
echo "##vso[task.setvariable variable=is_perf_reg]true"
fi
}
function generate_results {
echo "Generating tuning results"
oses=$(sed '1d' ${summaryLog} |cut -d';' -f1 | awk '!a[$0]++')
echo ${oses}
for os in ${oses[@]}
do
platforms=$(sed '1d' ${summaryLog} |grep "^${os}" |cut -d';' -f2 | awk '!a[$0]++')
echo ${platforms}
for platform in ${platforms[@]}
do
frameworks=$(sed '1d' ${summaryLog} |grep "^${os};${platform}" |cut -d';' -f3 | awk '!a[$0]++')
echo ${frameworks}
for framework in ${frameworks[@]}
do
fw_versions=$(sed '1d' ${summaryLog} |grep "^${os};${platform};${framework}" |cut -d';' -f4 | awk '!a[$0]++')
echo ${fw_versions}
for fw_version in ${fw_versions[@]}
do
models=$(sed '1d' ${summaryLog} |grep "^${os};${platform};${framework};${fw_version}" |cut -d';' -f6 | awk '!a[$0]++')
echo ${models}
for model in ${models[@]}
do
echo "--- processing model ---"
echo ${model}
current_values=$(generate_inference ${summaryLog})
echo "| current value |"
echo ${current_values}
last_values=$(generate_inference ${summaryLogLast})
echo "| last value |"
echo ${last_values}
generate_html_core ${current_values} ${last_values}
done
done
done
done
done
}
function generate_html_body {
MR_TITLE=''
Test_Info_Title=''
Test_Info=''
if [ "${qtools_branch}" == "" ];
then
commit_id=$(echo ${ghprbActualCommit} |awk '{print substr($1,1,7)}')
MR_TITLE="[ <a href='${repo_url}/pull/${ghprbPullId}'>PR-${ghprbPullId}</a> ]"
Test_Info_Title="<th colspan="2">Source Branch</th> <th colspan="4">Target Branch</th> <th colspan="4">Commit</th> "
Test_Info="<td colspan="2">${MR_source_branch}</td> <td colspan="4"><a href='${repo_url}/tree/${MR_target_branch}'>${MR_target_branch}</a></td> <td colspan="4"><a href='${MR_source_repo}/commit/${source_commit_id}'>${source_commit_id:0:6}</a></td>"
else
Test_Info_Title="<th colspan="4">Test Branch</th> <th colspan="4">Commit ID</th> "
Test_Info="<th colspan="4">${qtools_branch}</th> <th colspan="4">${qtools_commit}</th> "
fi
cat >> ${output_dir}/report.html << eof
<body>
<div id="main">
<h1 align="center">Neural Compressor Tuning Tests ${MR_TITLE}
[ <a
href="https://dev.azure.com/lpot-inc/neural-compressor/_build/results?buildId=${build_id}">Job-${build_id}</a>
]</h1>
<h1 align="center">Test Status: ${Jenkins_job_status}</h1>
<h2>Summary</h2>
<table class="features-table">
<tr>
<th>Repo</th>
${Test_Info_Title}
</tr>
<tr>
<td><a href="https://github.com/intel/neural-compressor">neural-compressor</a></td>
${Test_Info}
</tr>
</table>
eof
echo "Generating benchmarks table"
cat >> ${output_dir}/report.html << eof
<h2>Benchmark</h2>
<table class="features-table">
<tr>
<th rowspan="2">Platform</th>
<th rowspan="2">System</th>
<th rowspan="2">Framework</th>
<th rowspan="2">Version</th>
<th rowspan="2">Model</th>
<th rowspan="2">VS</th>
<th rowspan="2">Tuning<br>Strategy</th>
<th rowspan="2">Tuning<br>Time(s)</th>
<th rowspan="2">Tuning<br>Count</th>
<th colspan="4">INT8</th>
<th colspan="4">FP32</th>
<th colspan="2" class="col-cell col-cell1 col-cellh">Ratio</th>
</tr>
<tr>
<th>bs</th>
<th>imgs/s</th>
<th>bs</th>
<th>top1</th>
<th>bs</th>
<th>imgs/s</th>
<th>bs</th>
<th>top1</th>
<th class="col-cell col-cell1">Throughput<br><font size="2px">INT8/FP32</font></th>
<th class="col-cell col-cell1">Accuracy<br><font size="2px">(INT8-FP32)/FP32</font></th>
</tr>
eof
}
function generate_html_footer {
cat >> ${output_dir}/report.html << eof
<tr>
<td colspan="17"><font color="#d6776f">Note: </font>All data tested on Azure Cloud.</td>
<td colspan="2" class="col-cell col-cell1 col-cellf"></td>
</tr>
</table>
</div>
</body>
</html>
eof
}
function generate_html_head {
cat > ${output_dir}/report.html << eof
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<title>Daily Tests - TensorFlow - Jenkins</title>
<style type="text/css">
body
{
margin: 0;
padding: 0;
background: white no-repeat left top;
}
#main
{
// width: 100%;
margin: 20px auto 10px auto;
background: white;
-moz-border-radius: 8px;
-webkit-border-radius: 8px;
padding: 0 30px 30px 30px;
border: 1px solid #adaa9f;
-moz-box-shadow: 0 2px 2px #9c9c9c;
-webkit-box-shadow: 0 2px 2px #9c9c9c;
}
.features-table
{
width: 100%;
margin: 0 auto;
border-collapse: separate;
border-spacing: 0;
text-shadow: 0 1px 0 #fff;
color: #2a2a2a;
background: #fafafa;
background-image: -moz-linear-gradient(top, #fff, #eaeaea, #fff); /* Firefox 3.6 */
background-image: -webkit-gradient(linear,center bottom,center top,from(#fff),color-stop(0.5, #eaeaea),to(#fff));
font-family: Verdana,Arial,Helvetica
}
.features-table th,td
{
text-align: center;
height: 25px;
line-height: 25px;
padding: 0 8px;
border: 1px solid #cdcdcd;
box-shadow: 0 1px 0 white;
-moz-box-shadow: 0 1px 0 white;
-webkit-box-shadow: 0 1px 0 white;
white-space: nowrap;
}
.no-border th
{
box-shadow: none;
-moz-box-shadow: none;
-webkit-box-shadow: none;
}
.col-cell
{
text-align: center;
width: 150px;
font: normal 1em Verdana, Arial, Helvetica;
}
.col-cell3
{
background: #efefef;
background: rgba(144,144,144,0.15);
}
.col-cell1, .col-cell2
{
background: #B0C4DE;
background: rgba(176,196,222,0.3);
}
.col-cellh
{
font: bold 1.3em 'trebuchet MS', 'Lucida Sans', Arial;
-moz-border-radius-topright: 10px;
-moz-border-radius-topleft: 10px;
border-top-right-radius: 10px;
border-top-left-radius: 10px;
border-top: 1px solid #eaeaea !important;
}
.col-cellf
{
font: bold 1.4em Georgia;
-moz-border-radius-bottomright: 10px;
-moz-border-radius-bottomleft: 10px;
border-bottom-right-radius: 10px;
border-bottom-left-radius: 10px;
border-bottom: 1px solid #dadada !important;
}
</style>
</head>
eof
}
main

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Benchmarking: measure the model performance with the objective settings."""
import argparse
import subprocess
import numpy as np
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument("--cores_per_instance", type=int, required=True)
parser.add_argument("--num_of_instance", type=int, required=True)
args = parser.parse_args()
def get_architecture():
"""Get the architecture name of the system."""
p1 = subprocess.Popen("lscpu", stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p2 = subprocess.Popen(["grep", "Architecture"], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(["cut", "-d", ":", "-f2"], stdin=p2.stdout, stdout=subprocess.PIPE)
res = None
for line in iter(p3.stdout.readline, b""):
res = line.decode("utf-8").strip()
return res
def get_threads_per_core():
"""Get the threads per core."""
p1 = subprocess.Popen("lscpu", stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p2 = subprocess.Popen(["grep", "Thread(s) per core"], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(["cut", "-d", ":", "-f2"], stdin=p2.stdout, stdout=subprocess.PIPE)
res = None
for line in iter(p3.stdout.readline, b""):
res = line.decode("utf-8").strip()
return res
def get_threads():
"""Get the list of threads."""
p1 = subprocess.Popen(["cat", "/proc/cpuinfo"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p2 = subprocess.Popen(["grep", "processor"], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(["cut", "-d", ":", "-f2"], stdin=p2.stdout, stdout=subprocess.PIPE)
res = []
for line in iter(p3.stdout.readline, b""):
res.append(line.decode("utf-8").strip())
return res
def get_physical_ids():
"""Get the list of sockets."""
p1 = subprocess.Popen(["cat", "/proc/cpuinfo"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p2 = subprocess.Popen(["grep", "physical id"], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(["cut", "-d", ":", "-f2"], stdin=p2.stdout, stdout=subprocess.PIPE)
res = []
for line in iter(p3.stdout.readline, b""):
res.append(line.decode("utf-8").strip())
return res
def get_core_ids():
"""Get the ids list of the cores."""
p1 = subprocess.Popen(["cat", "/proc/cpuinfo"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p2 = subprocess.Popen(["grep", "core id"], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(["cut", "-d", ":", "-f2"], stdin=p2.stdout, stdout=subprocess.PIPE)
res = []
for line in iter(p3.stdout.readline, b""):
res.append(line.decode("utf-8").strip())
return res
def get_bounded_threads(core_ids, threads, sockets):
"""Return the threads id list that we will bind instances to."""
res = []
existing_socket_core_list = []
for idx, x in enumerate(core_ids):
socket_core = sockets[idx] + ":" + x
if socket_core not in existing_socket_core_list:
res.append(int(threads[idx]))
existing_socket_core_list.append(socket_core)
return res
def config_instance(cores_per_instance, num_of_instance):
"""Configure the multi-instance commands and trigger benchmark with sub process."""
core = []
if get_architecture() == "aarch64" and int(get_threads_per_core()) > 1:
raise OSError("Currently no support on AMD with hyperthreads")
else:
bounded_threads = get_bounded_threads(get_core_ids(), get_threads(), get_physical_ids())
for i in range(0, num_of_instance):
if get_architecture() == "x86_64":
core_list_idx = np.arange(0, cores_per_instance) + i * cores_per_instance
core_list = np.array(bounded_threads)[core_list_idx]
else:
core_list = np.arange(0, cores_per_instance) + i * cores_per_instance
core.append(core_list.tolist())
for i in range(len(core)):
core[i] = [str(j) for j in core[i]]
core[i] = ",".join(core[i])
core = ";".join(core)
return core
if __name__ == "__main__":
print(config_instance(args.cores_per_instance, args.num_of_instance))
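# Illustrative usage (assuming a hypothetical single-socket, non-hyperthreaded 8-core x86_64 host):
#   python new_benchmark.py --cores_per_instance=4 --num_of_instance=2
#   -> prints "0,1,2,3;4,5,6,7"; run_benchmark_common.sh splits this on ';' to pin one instance per core group.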

View File

@@ -0,0 +1,140 @@
#!/bin/bash
set -eo pipefail
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
SCRIPTS_PATH="/neural-compressor/.azure-pipelines/scripts/models"
for i in "$@"; do
case $i in
--framework=*)
framework=`echo $i | sed "s/${PATTERN}//"`;;
--model=*)
model=`echo $i | sed "s/${PATTERN}//"`;;
--input_model=*)
input_model=`echo $i | sed "s/${PATTERN}//"`;;
--benchmark_cmd=*)
benchmark_cmd=`echo $i | sed "s/${PATTERN}//"`;;
--log_dir=*)
log_dir=`echo $i | sed "s/${PATTERN}//"`;;
--new_benchmark=*)
new_benchmark=`echo $i | sed "s/${PATTERN}//"`;;
--precision=*)
precision=`echo $i | sed "s/${PATTERN}//"`;;
--stage=*)
stage=`echo $i | sed "s/${PATTERN}//"`;;
--USE_TUNE_ACC=*)
USE_TUNE_ACC=`echo $i | sed "s/${PATTERN}//"`;;
--PERF_STABLE_CHECK=*)
PERF_STABLE_CHECK=`echo $i | sed "s/${PATTERN}//"`;;
--BUILD_BUILDID=*)
BUILD_BUILDID=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
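# Example invocation (values are illustrative only):
#   bash run_benchmark_common.sh --framework=tensorflow --model=resnet50v1.5 \
#       --input_model=/path/to/model.pb --benchmark_cmd="bash run_benchmark.sh ..." \
#       --log_dir=/path/to/logs --new_benchmark=true --precision=fp32 \
#       --stage=fp32_benchmark --USE_TUNE_ACC=false --PERF_STABLE_CHECK=true --BUILD_BUILDID=0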
$BOLD_YELLOW && echo "-------- run_benchmark_common --------" && $RESET
main() {
# run accuracy
echo "USE_TUNE_ACC=${USE_TUNE_ACC}, PERF_STABLE_CHECK=${PERF_STABLE_CHECK}"
# USE_TUNE_ACC==true means using accuracy results from tuning log
if [ ${USE_TUNE_ACC} == "false" ]; then
run_accuracy
fi
# run performance
if [ ${PERF_STABLE_CHECK} == "false" ]; then
run_performance
else
max_loop=3
gap=(0.05 0.05 0.1)
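        # Re-run performance up to max_loop times; each retry passes the next
        # gap threshold from the array above to check_perf_gap.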
for ((iter = 0; iter < ${max_loop}; iter++)); do
run_performance
{
check_perf_gap ${gap[${iter}]}
exit_code=$?
} || true
if [ ${exit_code} -ne 0 ]; then
$BOLD_RED && echo "FAILED with performance gap!!" && $RESET
else
$BOLD_GREEN && echo "SUCCEED!!" && $RESET
break
fi
done
exit ${exit_code}
fi
}
function check_perf_gap() {
python -u ${SCRIPTS_PATH}/collect_log_model.py \
--framework=${framework} \
--fwk_ver=${fwk_ver} \
--model=${model} \
--logs_dir="${log_dir}" \
--output_dir="${log_dir}" \
--build_id=${BUILD_BUILDID} \
--stage=${stage} \
--gap=$1
}
function run_performance() {
cmd="${benchmark_cmd} --input_model=${input_model}"
if [ "${new_benchmark}" == "true" ]; then
$BOLD_YELLOW && echo "run with internal benchmark..." && $RESET
export NUM_OF_INSTANCE=2
export CORES_PER_INSTANCE=4
eval ${cmd} 2>&1 | tee ${log_dir}/${framework}-${model}-performance-${precision}.log
else
$BOLD_YELLOW && echo "run with external multiInstance benchmark..." && $RESET
multiInstance
fi
}
function run_accuracy() {
$BOLD_YELLOW && echo "run tuning accuracy in precision ${precision}" && $RESET
eval "${benchmark_cmd} --input_model=${input_model} --mode=accuracy" 2>&1 | tee ${log_dir}/${framework}-${model}-accuracy-${precision}.log
}
function multiInstance() {
ncores_per_socket=${ncores_per_socket:=$(lscpu | grep 'Core(s) per socket' | cut -d: -f2 | xargs echo -n)}
$BOLD_YELLOW && echo "Executing multi instance benchmark" && $RESET
ncores_per_instance=4
$BOLD_YELLOW && echo "ncores_per_socket=${ncores_per_socket}, ncores_per_instance=${ncores_per_instance}" && $RESET
logFile="${log_dir}/${framework}-${model}-performance-${precision}"
benchmark_pids=()
core_list=$(python ${SCRIPTS_PATH}/new_benchmark.py --cores_per_instance=${ncores_per_instance} --num_of_instance=$(expr $ncores_per_socket / $ncores_per_instance))
core_list=($(echo $core_list | tr ';' ' '))
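    # Launch one benchmark instance per core group, pinned with numactl, and record
    # each PID so the exit codes can be checked after all instances finish.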
for ((j = 0; $j < $(expr $ncores_per_socket / $ncores_per_instance); j = $(($j + 1)))); do
$BOLD_GREEN && echo "OMP_NUM_THREADS=${ncores_per_instance} numactl --localalloc --physcpubind=${core_list[${j}]} ${cmd} 2>&1 | tee ${logFile}-${ncores_per_socket}-${ncores_per_instance}-${j}.log &" && $RESET
OMP_NUM_THREADS=${ncores_per_instance} numactl --localalloc --physcpubind=${core_list[${j}]} ${cmd} 2>&1 | tee ${logFile}-${ncores_per_socket}-${ncores_per_instance}-${j}.log &
benchmark_pids+=($!)
done
status="SUCCESS"
for pid in "${benchmark_pids[@]}"; do
wait $pid
exit_code=$?
$BOLD_YELLOW && echo "Detected exit code: ${exit_code}" && $RESET
if [ ${exit_code} == 0 ]; then
$BOLD_GREEN && echo "Process ${pid} succeeded" && $RESET
else
$BOLD_RED && echo "Process ${pid} failed" && $RESET
status="FAILURE"
fi
done
$BOLD_YELLOW && echo "Benchmark process status: ${status}" && $RESET
if [ ${status} == "FAILURE" ]; then
$BOLD_RED && echo "Benchmark process returned non-zero exit code." && $RESET
exit 1
fi
}
main

View File

@@ -0,0 +1,177 @@
#!/bin/bash
set -eo pipefail
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
for i in "$@"
do
case $i in
--yaml=*)
yaml=`echo $i | sed "s/${PATTERN}//"`;;
--framework=*)
framework=`echo $i | sed "s/${PATTERN}//"`;;
--fwk_ver=*)
fwk_ver=`echo $i | sed "s/${PATTERN}//"`;;
--torch_vision_ver=*)
torch_vision_ver=`echo $i | sed "s/${PATTERN}//"`;;
--model=*)
model=`echo $i | sed "s/${PATTERN}//"`;;
--model_src_dir=*)
model_src_dir=`echo $i | sed "s/${PATTERN}//"`;;
--dataset_location=*)
dataset_location=`echo $i | sed "s/${PATTERN}//"`;;
--input_model=*)
input_model=`echo $i | sed "s/${PATTERN}//"`;;
--batch_size=*)
batch_size=`echo $i | sed "s/${PATTERN}//"`;;
--strategy=*)
strategy=`echo $i | sed "s/${PATTERN}//"`;;
--new_benchmark=*)
new_benchmark=`echo $i | sed "s/${PATTERN}//"`;;
--inc_new_api=*)
inc_new_api=`echo $i | sed "s/${PATTERN}//"`;;
--tuning_cmd=*)
tuning_cmd=`echo $i | sed "s/${PATTERN}//"`;;
--benchmark_cmd=*)
benchmark_cmd=`echo $i | sed "s/${PATTERN}//"`;;
--mode=*)
mode=`echo $i | sed "s/${PATTERN}//"`;;
--USE_TUNE_ACC=*)
USE_TUNE_ACC=`echo $i | sed "s/${PATTERN}//"`;;
--PERF_STABLE_CHECK=*)
PERF_STABLE_CHECK=`echo $i | sed "s/${PATTERN}//"`;;
--BUILD_BUILDID=*)
BUILD_BUILDID=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
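# Supported --mode values: env_setup, tuning, fp32_benchmark, int8_benchmark, collect_log.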
function check_results() {
local control_phrase=$1
if [ $(grep "${control_phrase}" ${log_dir}/${model}/${framework}-${model}-tune.log | wc -l) == 0 ];then
$BOLD_RED && echo "====== Quantization FAILED!! ======" && $RESET; exit 1
fi
}
log_dir="/neural-compressor/.azure-pipelines/scripts/models"
SCRIPTS_PATH="/neural-compressor/.azure-pipelines/scripts/models"
if [[ "${inc_new_api}" == "3x"* ]]; then
WORK_SOURCE_DIR="/neural-compressor/examples/3.x_api/${framework}"
else
WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"
fi
$BOLD_YELLOW && echo "processing ${framework}-${fwk_ver}-${model}" && $RESET
if [ "${mode}" == "env_setup" ]; then
/bin/bash env_setup.sh \
--yaml=${yaml} \
--framework=${framework} \
--fwk_ver=${fwk_ver} \
--torch_vision_ver=${torch_vision_ver} \
--model=${model} \
--model_src_dir=${model_src_dir} \
--dataset_location=${dataset_location} \
--batch_size=${batch_size} \
--strategy=${strategy} \
--new_benchmark=${new_benchmark} \
--inc_new_api="${inc_new_api}"
elif [ "${mode}" == "tuning" ]; then
if [ "${framework}" == "onnxrt" ]; then
output_model=${log_dir}/${model}/${framework}-${model}-tune.onnx
elif [ "${framework}" == "tensorflow" ]; then
output_model=${log_dir}/${model}/${framework}-${model}-tune.pb
fi
[[ ${output_model} ]] && tuning_cmd="${tuning_cmd} --output_model=${output_model}"
cd ${WORK_SOURCE_DIR}/${model_src_dir}
    # for int4 models, add "--accuracy --load" so accuracy is evaluated right after quantization
if [[ "${model}" == *"int4"* ]]; then
sed -i "s|--quantize|--quantize --accuracy --load|g" run_quant.sh
fi
$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "tuning_cmd is === ${tuning_cmd}" && $RESET
$BOLD_YELLOW && echo "======== run tuning ========" && $RESET
/bin/bash ${SCRIPTS_PATH}/run_tuning_common.sh \
--tuning_cmd="${tuning_cmd}" \
--strategy=${strategy} \
2>&1 | tee -a ${log_dir}/${model}/${framework}-${model}-tune.log
$BOLD_YELLOW && echo "====== check tuning status. ======" && $RESET
if [[ "${inc_new_api}" == "3x"* ]]; then
control_phrase_1="Preparation end."
check_results $control_phrase_1
control_phrase_2="Conversion end."
check_results $control_phrase_2
else
control_phrase="model which meet accuracy goal."
check_results $control_phrase
if [ $(grep "${control_phrase}" ${log_dir}/${model}/${framework}-${model}-tune.log | grep "Not found" | wc -l) == 1 ];then
$BOLD_RED && echo "====== Quantization FAILED!! ======" && $RESET; exit 1
fi
fi
$BOLD_GREEN && echo "====== Quantization SUCCEED!! ======" && $RESET
elif [ "${mode}" == "fp32_benchmark" ]; then
cd ${WORK_SOURCE_DIR}/${model_src_dir}
$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "benchmark_cmd is ${benchmark_cmd}" && $RESET
$BOLD_YELLOW && echo "====== run benchmark fp32 =======" && $RESET
/bin/bash ${SCRIPTS_PATH}/run_benchmark_common.sh \
--framework=${framework} \
--model=${model} \
--input_model=${input_model} \
--benchmark_cmd="${benchmark_cmd}" \
--log_dir="${log_dir}/${model}" \
--new_benchmark=${new_benchmark} \
--precision="fp32" \
--stage=${mode} \
--USE_TUNE_ACC=${USE_TUNE_ACC} \
--PERF_STABLE_CHECK=${PERF_STABLE_CHECK} \
--BUILD_BUILDID=${BUILD_BUILDID}
elif [ "${mode}" == "int8_benchmark" ]; then
cd ${WORK_SOURCE_DIR}/${model_src_dir}
$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "benchmark_cmd is ${benchmark_cmd}" && $RESET
$BOLD_YELLOW && echo "====== run benchmark int8 =======" && $RESET
if [[ "${framework}" == "onnxrt" ]]; then
model_name="${log_dir}/${model}/${framework}-${model}-tune.onnx"
elif [[ "${framework}" == "tensorflow" ]]; then
model_name="${log_dir}/${model}/${framework}-${model}-tune.pb"
elif [[ "${framework}" == "pytorch" ]]; then
model_name=${input_model}
benchmark_cmd="${benchmark_cmd} --int8=true"
fi
/bin/bash ${SCRIPTS_PATH}/run_benchmark_common.sh \
--framework=${framework} \
--model=${model} \
--input_model="${model_name}" \
--benchmark_cmd="${benchmark_cmd}" \
--log_dir="${log_dir}/${model}" \
--new_benchmark=${new_benchmark} \
--precision="int8" \
--stage=${mode} \
--USE_TUNE_ACC=${USE_TUNE_ACC} \
--PERF_STABLE_CHECK=${PERF_STABLE_CHECK} \
--BUILD_BUILDID=${BUILD_BUILDID}
elif [ "${mode}" == "collect_log" ]; then
cd ${WORK_SOURCE_DIR}/${model_src_dir}
$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
$BOLD_YELLOW && echo "====== collect logs of model ${model} =======" && $RESET
if [ "${framework}" == "pytorch" ] && [ "${fwk_ver}" == "latest" ]; then
fwk_ver=$(python -c "import torch; print(torch.__version__)")
fi
python -u ${SCRIPTS_PATH}/collect_log_model.py \
--framework=${framework} \
--fwk_ver=${fwk_ver} \
--model=${model} \
--logs_dir="${log_dir}/${model}" \
--output_dir="${log_dir}/${model}" \
--build_id=${BUILD_BUILDID} \
--stage=${mode} \
--inc_new_api="${inc_new_api}"
$BOLD_YELLOW && echo "====== Finish collect logs =======" && $RESET
fi

View File

@@ -0,0 +1,62 @@
#!/bin/bash
set -eo pipefail
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
for i in "$@"
do
case $i in
--model=*)
model=`echo $i | sed "s/${PATTERN}//"`;;
--mode=*)
mode=`echo $i | sed "s/${PATTERN}//"`;;
--USE_TUNE_ACC=*)
USE_TUNE_ACC=`echo $i | sed "s/${PATTERN}//"`;;
--PERF_STABLE_CHECK=*)
PERF_STABLE_CHECK=`echo $i | sed "s/${PATTERN}//"`;;
--BUILD_BUILDID=*)
BUILD_BUILDID=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
echo "specify FWs version..."
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh 'latest'
FRAMEWORK="onnxrt"
FRAMEWORK_VERSION=${onnxruntime_version}
inc_new_api=false
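# Per-model configuration; currently only resnet50-v1-12 is wired up for onnxrt.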
# ======== set up config for onnxrt models ========
if [ "${model}" == "resnet50-v1-12" ]; then
model_src_dir="image_recognition/onnx_model_zoo/resnet50/quantization/ptq_static"
dataset_location="/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000/ILSVRC2012_img_val"
input_model="/tf_dataset2/models/onnx/resnet50-v1-12/resnet50-v1-12.onnx"
yaml="resnet50_v1_5.yaml"
strategy="basic"
batch_size=1
new_benchmark=true
inc_new_api=true
tuning_cmd="bash run_quant.sh --input_model=${input_model} --dataset_location=${dataset_location}"
benchmark_cmd="bash run_benchmark.sh --config=${yaml} --mode=performance --dataset_location=${dataset_location}"
fi
/bin/bash run_model_trigger_common.sh \
--yaml=${yaml} \
--framework=${FRAMEWORK} \
--fwk_ver=${FRAMEWORK_VERSION} \
--model=${model} \
--model_src_dir=${model_src_dir} \
--dataset_location=${dataset_location} \
--input_model=${input_model} \
--batch_size=${batch_size} \
--strategy=${strategy} \
--new_benchmark=${new_benchmark} \
--tuning_cmd="${tuning_cmd}" \
--benchmark_cmd="${benchmark_cmd}" \
--inc_new_api="${inc_new_api}" \
--mode=${mode} \
--USE_TUNE_ACC=${USE_TUNE_ACC} \
--PERF_STABLE_CHECK=${PERF_STABLE_CHECK} \
--BUILD_BUILDID=${BUILD_BUILDID}

View File

@@ -0,0 +1,100 @@
#!/bin/bash
set -eo pipefail
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
for i in "$@"
do
case $i in
--model=*)
model=`echo $i | sed "s/${PATTERN}//"`;;
--mode=*)
mode=`echo $i | sed "s/${PATTERN}//"`;;
--USE_TUNE_ACC=*)
USE_TUNE_ACC=`echo $i | sed "s/${PATTERN}//"`;;
--PERF_STABLE_CHECK=*)
PERF_STABLE_CHECK=`echo $i | sed "s/${PATTERN}//"`;;
--BUILD_BUILDID=*)
BUILD_BUILDID=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
dataset_location=""
input_model=""
yaml=""
strategy=""
batch_size=""
new_benchmark=true
inc_new_api=true
benchmark_cmd=""
# ======== set up config for pytorch models ========
if [ "${model}" == "resnet18" ]; then
model_src_dir="image_recognition/torchvision_models/quantization/ptq/cpu/eager"
dataset_location="/tf_dataset2/datasets/mini-imageraw"
input_model=""
yaml="conf.yaml"
strategy="bayesian"
batch_size=1
new_benchmark=false
inc_new_api=false
tuning_cmd="bash run_tuning.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=benchmark --batch_size=${batch_size} --iters=500"
elif [ "${model}" == "resnet18_fx" ]; then
model_src_dir="image_recognition/torchvision_models/quantization/ptq/cpu/fx/"
dataset_location="/tf_dataset2/datasets/mini-imageraw"
input_model="resnet18"
yaml=""
strategy="basic"
batch_size=1
new_benchmark=true
inc_new_api=true
tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500"
elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4"
elif [ "${model}" == "opt_125m_woq_gptq_nf4_dq_bnb" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_nf4_dq_bnb"
elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
inc_new_api=3x_pt
tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml"
fi
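# For the 3x API weight-only (opt_125m_woq_*) models only tuning_cmd is set; the dataset
# and benchmark fields stay empty and are passed through to the common trigger as-is.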
echo "Specify FWs version..."
FRAMEWORK="pytorch"
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh 'latest'
if [[ "${inc_new_api}" == "3x"* ]]; then
FRAMEWORK_VERSION="latest"
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
else
FRAMEWORK_VERSION=${pytorch_version}
TORCH_VISION_VERSION=${torchvision_version}
fi
/bin/bash run_model_trigger_common.sh \
--yaml=${yaml} \
--framework=${FRAMEWORK} \
--fwk_ver=${FRAMEWORK_VERSION} \
--torch_vision_ver=${TORCH_VISION_VERSION} \
--model=${model} \
--model_src_dir=${model_src_dir} \
--dataset_location=${dataset_location} \
--input_model=${input_model} \
--batch_size=${batch_size} \
--strategy=${strategy} \
--new_benchmark=${new_benchmark} \
--tuning_cmd="${tuning_cmd}" \
--benchmark_cmd="${benchmark_cmd}" \
--inc_new_api="${inc_new_api}" \
--mode=${mode} \
--USE_TUNE_ACC=${USE_TUNE_ACC} \
--PERF_STABLE_CHECK=${PERF_STABLE_CHECK} \
--BUILD_BUILDID=${BUILD_BUILDID}

View File

@@ -0,0 +1,118 @@
#!/bin/bash
set -eo pipefail
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
for i in "$@"
do
case $i in
--model=*)
model=`echo $i | sed "s/${PATTERN}//"`;;
--mode=*)
mode=`echo $i | sed "s/${PATTERN}//"`;;
--USE_TUNE_ACC=*)
USE_TUNE_ACC=`echo $i | sed "s/${PATTERN}//"`;;
--PERF_STABLE_CHECK=*)
PERF_STABLE_CHECK=`echo $i | sed "s/${PATTERN}//"`;;
--BUILD_BUILDID=*)
BUILD_BUILDID=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
echo "specify FWs version..."
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh 'latest'
FRAMEWORK="tensorflow"
FRAMEWORK_VERSION=${tensorflow_version}
inc_new_api=false
# ======== set up config for tensorflow models ========
if [ "${model}" == "resnet50v1.5" ]; then
model_src_dir="image_recognition/tensorflow_models/resnet50_v1_5/quantization/ptq"
dataset_location="/tf_dataset/dataset/TF_mini_imagenet"
input_model="/tf_dataset/pre-trained-models/resnet50v1_5/fp32/resnet50_v1.pb"
new_benchmark=true
inc_new_api=true
tuning_cmd="bash run_quant.sh --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --dataset_location=${dataset_location} --batch_size=1 --mode=performance"
elif [ "${model}" == "ssd_resnet50_v1" ];then
model_src_dir="object_detection/tensorflow_models/ssd_resnet50_v1/quantization/ptq"
dataset_location="/tf_dataset/tensorflow/mini-coco-100.record"
input_model="/tf_dataset/pre-train-model-oob/object_detection/ssd_resnet50_v1/frozen_inference_graph.pb"
new_benchmark=true
inc_new_api=true
tuning_cmd="bash run_quant.sh --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --dataset_location=${dataset_location} --batch_size=1 --mode=performance"
elif [ "${model}" == "ssd_mobilenet_v1_ckpt" ];then
model_src_dir="object_detection/tensorflow_models/ssd_mobilenet_v1/quantization/ptq"
dataset_location="/tf_dataset/tensorflow/mini-coco-100.record"
input_model="/tf_dataset/pre-train-model-oob/object_detection/ssd_mobilenet_v1"
new_benchmark=true
inc_new_api=true
tuning_cmd="bash run_quant.sh --dataset_location=${dataset_location} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --dataset_location=${dataset_location} --batch_size=1 --mode=performance"
elif [ "${model}" == "inception_v1" ]; then
model_src_dir="image_recognition/tensorflow_models/quantization/ptq"
dataset_location="/tf_dataset/dataset/TF_mini_imagenet"
input_model="/tf_dataset/pre-train-model-slim/pbfile/frozen_pb/frozen_inception_v1.pb"
yaml="inception_v1.yaml"
strategy="basic"
batch_size=1
new_benchmark=true
tuning_cmd="bash run_tuning.sh --config=${yaml} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --config=${yaml} --mode=performance"
elif [ "${model}" == "darknet19" ]; then
model_src_dir="oob_models/quantization/ptq"
dataset_location=""
input_model="/tf_dataset/tensorflow/tf_oob_models/ov/all_tf_models/PublicInHouse/classification/darknet19/darknet19.pb"
yaml="config.yaml"
strategy="basic"
batch_size=1
new_benchmark=false
inc_new_api=true
tuning_cmd="bash run_quant.sh --topology=${model} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=${model} --mode=performance --batch_size=1 --iters=500"
elif [ "${model}" == "densenet-121" ]; then
model_src_dir="oob_models/quantization/ptq"
dataset_location=""
input_model="/tf_dataset/tensorflow/tf_oob_models/ov/all_tf_models/classification/densenet/121/tf/densenet-121.pb"
yaml="config.yaml"
strategy="basic"
batch_size=1
new_benchmark=false
inc_new_api=true
tuning_cmd="bash run_quant.sh --topology=${model} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=${model} --mode=performance --batch_size=1 --iters=500"
elif [ "${model}" == "resnet-101" ]; then
model_src_dir="oob_models/quantization/ptq"
dataset_location=""
input_model="/tf_dataset/tensorflow/tf_oob_models/ov/all_tf_models/classification/resnet/v1/101/tf/resnet-101.pb"
yaml="config.yaml"
strategy="basic"
batch_size=1
new_benchmark=false
inc_new_api=true
tuning_cmd="bash run_quant.sh --topology=${model} --input_model=${input_model}"
benchmark_cmd="bash run_benchmark.sh --topology=${model} --mode=performance --batch_size=1 --iters=500"
fi
/bin/bash run_model_trigger_common.sh \
--yaml=${yaml} \
--framework=${FRAMEWORK} \
--fwk_ver=${FRAMEWORK_VERSION} \
--model=${model} \
--model_src_dir=${model_src_dir} \
--dataset_location=${dataset_location} \
--input_model=${input_model} \
--batch_size=${batch_size} \
--strategy=${strategy} \
--new_benchmark=${new_benchmark} \
--tuning_cmd="${tuning_cmd}" \
--benchmark_cmd="${benchmark_cmd}" \
--inc_new_api="${inc_new_api}" \
--mode=${mode} \
--USE_TUNE_ACC=${USE_TUNE_ACC} \
--PERF_STABLE_CHECK=${PERF_STABLE_CHECK} \
--BUILD_BUILDID=${BUILD_BUILDID}

View File

@@ -0,0 +1,30 @@
#!/bin/bash
set -eo pipefail
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
# get parameters
PATTERN='[-a-zA-Z0-9_]*='
starttime=`date +'%Y-%m-%d %H:%M:%S'`
for i in "$@"
do
case $i in
--tuning_cmd=*)
tuning_cmd=`echo $i | sed "s/${PATTERN}//"`;;
--strategy=*)
strategy=`echo $i | sed "s/${PATTERN}//"`;;
*)
echo "Parameter $i not recognized."; exit 1;;
esac
done
eval "/usr/bin/time -v ${tuning_cmd}"
$BOLD_YELLOW && echo "====== finish tuning. echo information. ======" && $RESET
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s);
end_seconds=$(date --date="$endtime" +%s);
$BOLD_GREEN && echo "Tuning time spend: "$((end_seconds-start_seconds))"s " && $RESET
$BOLD_GREEN && echo "Tuning strategy: ${strategy}" && $RESET
$BOLD_GREEN && echo "Total resident size (kbytes): $(cat /proc/meminfo | grep 'MemTotal' | sed 's/[^0-9]//g')" && $RESET

View File

@@ -0,0 +1,322 @@
import argparse
import os
import platform
import re
from typing import Optional, Union
import psutil
system = platform.system()
try:
import ruamel.yaml as yaml
except ImportError:
import ruamel_yaml as yaml
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--yaml", type=str, required=True, help="Path to yaml config.")
parser.add_argument("--framework", type=str, required=True, help="Framework of model.")
parser.add_argument("--dataset_location", type=str, required=True, help="Location of dataset used for model.")
parser.add_argument("--strategy", type=str, required=False, help="Strategy to update.")
parser.add_argument("--batch_size", type=int, required=False, help="Batch size.")
parser.add_argument("--new_benchmark", type=str, required=False, help="Whether to modify benchmark config.")
parser.add_argument("--multi_instance", type=str, required=False, help="Whether to eval in multi-instance.")
return parser.parse_args()
def update_yaml_dataset(yaml, framework, dataset_location):
if not os.path.isfile(yaml):
raise Exception(f"Not found yaml config at '{yaml}' location.")
print("Reading config")
with open(yaml, "r") as config:
lines = config.readlines()
# Update dataset
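    # Non-PyTorch configs reference /path/to/... placeholders that are rewritten to the real
    # dataset location; PyTorch configs instead use separate train/ and val/ roots.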
if framework != "pytorch":
val_txt_location = os.path.dirname(dataset_location) + f"{os.path.sep}" + "val.txt"
patterns = {
"root_path": {
"pattern": r"root:.*/path/to/(calibration|evaluation)/dataset/?",
"replacement": f"root: {dataset_location}",
},
"data_path": {
"pattern": r"data_path:.*/path/to/(calibration|evaluation)/dataset/?",
"replacement": f"data_path: {dataset_location}",
},
"image_list": {
"pattern": r"image_list:.*/path/to/(calibration|evaluation)/label/?",
"replacement": f"image_list: {val_txt_location}",
},
"data_dir": {
"pattern": r"data_dir:.*/path/to/dataset/?",
"replacement": f"data_dir: {dataset_location}",
},
}
print("======= update_yaml_dataset =======")
with open(yaml, "w") as config:
for line in lines:
for key, key_patterns in patterns.items():
if re.search(key_patterns["pattern"], line):
print(f"Replacing {key} key.")
line = re.sub(key_patterns["pattern"], key_patterns["replacement"], line)
config.write(line)
else:
val_dataset = dataset_location + f"{os.path.sep}" + "val"
train_dataset = dataset_location + f"{os.path.sep}" + "train"
patterns = {
"calibration_dataset": {
"pattern": r"root:.*/path/to/calibration/dataset/?",
"replacement": f"root: {train_dataset}",
},
"evaluation_dataset": {
"pattern": r"root:.*/path/to/evaluation/dataset/?",
"replacement": f"root: {val_dataset}",
},
}
print("======= update_yaml_dataset =======")
with open(yaml, "w") as config:
for line in lines:
for key, key_patterns in patterns.items():
if re.search(key_patterns["pattern"], line):
print(f"Replacing {key} key.")
line = re.sub(key_patterns["pattern"], key_patterns["replacement"], line)
config.write(line)
def update_yaml_config_tuning(
yaml_file,
strategy=None,
mode=None,
batch_size=None,
iteration=None,
max_trials=None,
algorithm=None,
timeout=None,
strategy_token=None,
sampling_size=None,
dtype=None,
tf_new_api=None,
):
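    # Load with round_trip_load so comments and quoting in the yaml survive the rewrite;
    # only the keys explicitly touched below are modified.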
with open(yaml_file) as f:
yaml_config = yaml.round_trip_load(f, preserve_quotes=True)
if algorithm:
try:
model_wise = yaml_config.get("quantization", {}).get("model_wise", {})
prev_activation = model_wise.get("activation", {})
if not prev_activation:
model_wise.update({"activation": {}})
prev_activation = model_wise.get("activation", {})
prev_activation.update({"algorithm": algorithm})
except Exception as e:
print(f"[ WARNING ] {e}")
if timeout:
try:
exit_policy = yaml_config.get("tuning", {}).get("exit_policy", {})
prev_timeout = exit_policy.get("timeout", None)
exit_policy.update({"timeout": timeout})
print(f"Changed {prev_timeout} to {timeout}")
except Exception as e:
print(f"[ WARNING ] {e}")
if strategy and strategy != "basic": # Workaround for PyTorch huggingface models (`sed` in run_quant.sh)
try:
tuning_config = yaml_config.get("tuning", {})
prev_strategy = tuning_config.get("strategy", {})
if not prev_strategy:
tuning_config.update({"strategy": {}})
prev_strategy = tuning_config.get("strategy", {})
strategy_name = prev_strategy.get("name", None)
prev_strategy.update({"name": strategy})
if strategy == "sigopt":
prev_strategy.update(
{
"sigopt_api_token": strategy_token,
"sigopt_project_id": "lpot",
"sigopt_experiment_name": "lpot-tune",
}
)
if strategy == "hawq":
prev_strategy.update({"loss": "CrossEntropyLoss"})
print(f"Changed {strategy_name} to {strategy}")
except Exception as e:
print(f"[ WARNING ] {e}")
if max_trials and max_trials > 0:
try:
tuning_config = yaml_config.get("tuning", {})
prev_exit_policy = tuning_config.get("exit_policy", {})
if not prev_exit_policy:
tuning_config.update({"exit_policy": {"max_trials": max_trials}})
else:
prev_max_trials = prev_exit_policy.get("max_trials", None)
prev_exit_policy.update({"max_trials": max_trials})
print(f"Changed {prev_max_trials} to {max_trials}")
except Exception as e:
print(f"[ WARNING ] {e}")
if mode == "accuracy":
try:
# delete performance part in yaml if exist
performance = yaml_config.get("evaluation", {}).get("performance", {})
if performance:
yaml_config.get("evaluation", {}).pop("performance", {})
# accuracy batch_size replace
if batch_size:
try:
dataloader = yaml_config.get("evaluation", {}).get("accuracy", {}).get("dataloader", {})
prev_batch_size = dataloader.get("batch_size", None)
dataloader.update({"batch_size": batch_size})
print(f"Changed accuracy batch size from {prev_batch_size} to {batch_size}")
except Exception as e:
print(f"[ WARNING ] {e}")
except Exception as e:
print(f"[ WARNING ] {e}")
elif mode:
try:
# delete accuracy part in yaml if exist
accuracy = yaml_config.get("evaluation", {}).get("accuracy", {})
if accuracy:
yaml_config.get("evaluation", {}).pop("accuracy", {})
# performance iteration replace
if iteration:
try:
performance = yaml_config.get("evaluation", {}).get("performance", {})
prev_iteration = performance.get("iteration", None)
performance.update({"iteration": iteration})
print(f"Changed performance batch size from {prev_iteration} to {iteration}")
except Exception as e:
print(f"[ WARNING ] {e}")
if batch_size and mode == "latency":
try:
dataloader = yaml_config.get("evaluation", {}).get("performance", {}).get("dataloader", {})
prev_batch_size = dataloader.get("batch_size", None)
dataloader.update({"batch_size": batch_size})
print(f"Changed accuracy batch size from {prev_batch_size} to {batch_size}")
except Exception as e:
print(f"[ WARNING ] {e}")
except Exception as e:
print(f"[ WARNING ] {e}")
if sampling_size:
try:
calibration = yaml_config.get("quantization", {}).get("calibration", {})
prev_sampling_size = calibration.get("sampling_size", None)
calibration.update({"sampling_size": sampling_size})
print(f"Changed calibration sampling size from {prev_sampling_size} to {sampling_size}")
except Exception as e:
print(f"[ WARNING ] {e}")
if dtype:
try:
quantization = yaml_config.get("quantization", {})
prev_dtype = quantization.get("dtype", None)
quantization.update({"dtype": dtype})
print(f"Changed dtype from {prev_dtype} to {dtype}")
except Exception as e:
print(f"[ WARNING ] {e}")
if tf_new_api == "true":
try:
model = yaml_config.get("model", {})
prev_framework = model.get("framework", None)
model.update({"framework": "inteltensorflow"})
print(f"Changed framework from {prev_framework} to inteltensorflow")
except Exception as e:
print(f"[ WARNING ] {e}")
print("====== update_yaml_config_tuning ========")
yaml_content = yaml.round_trip_dump(yaml_config)
with open(yaml_file, "w") as output_file:
output_file.write(yaml_content)
def update_yaml_config_benchmark_acc(yaml_path: str, batch_size=None):
with open(yaml_path) as f:
yaml_config = yaml.round_trip_load(f, preserve_quotes=True)
try:
accuracy = yaml_config.get("evaluation", {}).get("accuracy", {})
if not accuracy:
raise AttributeError
dataloader = accuracy.get("dataloader", {})
if dataloader:
dataloader.update({"batch_size": batch_size})
configs = accuracy.get("configs", {})
if configs:
del accuracy["configs"]
except Exception as e:
print(f"[ WARNING ] {e}")
print("====== update_yaml_config_benchmark_acc ========")
yaml_content = yaml.round_trip_dump(yaml_config)
with open(yaml_path, "w") as output_file:
output_file.write(yaml_content)
def update_yaml_config_benchmark_perf(yaml_path: str, batch_size=None, multi_instance=None):
# Get cpu information for multi-instance
total_cores = psutil.cpu_count(logical=False)
total_sockets = 1
ncores_per_socket = total_cores / total_sockets
ncores_per_instance = ncores_per_socket
iters = 100
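    # Multi-instance benchmarking pins 4 cores per instance and uses more iterations
    # for a more stable performance measurement.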
if multi_instance == "true":
ncores_per_instance = 4
iters = 500
with open(yaml_path) as f:
yaml_config = yaml.round_trip_load(f, preserve_quotes=True)
try:
performance = yaml_config.get("evaluation", {}).get("performance", {})
if not performance:
raise AttributeError
dataloader = performance.get("dataloader", {})
if dataloader:
dataloader.update({"batch_size": batch_size})
performance.update({"iteration": iters})
configs = performance.get("configs", {})
if not configs:
raise AttributeError
else:
configs.update(
{
"cores_per_instance": int(ncores_per_instance),
"num_of_instance": int(ncores_per_socket // ncores_per_instance),
}
)
for attr in ["intra_num_of_threads", "inter_num_of_threads", "kmp_blocktime"]:
if configs.get(attr):
del configs[attr]
print(configs)
except Exception as e:
print(f"[ WARNING ] {e}")
print("====== update_yaml_config_benchmark_perf ========")
yaml_content = yaml.round_trip_dump(yaml_config)
with open(yaml_path, "w") as output_file:
output_file.write(yaml_content)
if __name__ == "__main__":
args = parse_args()
update_yaml_dataset(args.yaml, args.framework, args.dataset_location)
update_yaml_config_tuning(args.yaml, strategy=args.strategy)
print("===== multi_instance={} ====".format(args.multi_instance))
if args.new_benchmark == "true":
update_yaml_config_benchmark_acc(args.yaml, batch_size=args.batch_size)
update_yaml_config_benchmark_perf(args.yaml, batch_size=args.batch_size, multi_instance=args.multi_instance)

View File

@@ -0,0 +1,134 @@
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
set -e
pip install coverage
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.${1}
coverage_log="/neural-compressor/log_dir/coverage_log"
coverage_log_base="/neural-compressor/log_dir/coverage_log_base"
coverage_compare="/neural-compressor/log_dir/coverage_compare.html"
cd /neural-compressor/log_dir
$BOLD_YELLOW && echo "collect coverage for PR branch" && $RESET
cp ut_3x_coverage/.coverage /neural-compressor/
mkdir -p coverage_PR
cd /neural-compressor
coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log}
coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE}
coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE}
ls -l log_dir/coverage_PR/htmlcov
$BOLD_YELLOW && echo "collect coverage for baseline" && $RESET
cd /neural-compressor
cp -r /neural-compressor/.azure-pipelines .azure-pipelines-pr
git config --global --add safe.directory /neural-compressor
git fetch
git checkout master
rm -rf build dist *egg-info
binary_index="${1%_fp8}"
echo y | pip uninstall neural_compressor_${binary_index}
cd /neural-compressor/.azure-pipelines-pr/scripts && bash install_nc.sh ${1}
coverage erase
cd /neural-compressor/log_dir
mkdir -p coverage_base
rm -rf /neural-compressor/.coverage || true
cp ut_3x_baseline_coverage/.coverage /neural-compressor
cd /neural-compressor
coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log_base}
coverage html -d log_dir/coverage_base/htmlcov --rcfile=${COVERAGE_RCFILE}
coverage xml -o log_dir/coverage_base/coverage.xml --rcfile=${COVERAGE_RCFILE}
ls -l log_dir/coverage_base/htmlcov
get_coverage_data() {
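    # Parse a Cobertura-style coverage.xml and echo six fields:
    # lines_covered lines_valid lines_rate branches_covered branches_valid branches_rate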
# Input argument
local coverage_xml="$1"
# Get coverage data
local coverage_data=$(python3 -c "import xml.etree.ElementTree as ET; root = ET.parse('$coverage_xml').getroot(); print(ET.tostring(root).decode())")
if [[ -z "$coverage_data" ]]; then
echo "Failed to get coverage data from $coverage_xml."
exit 1
fi
# Get lines coverage
local lines_covered=$(echo "$coverage_data" | grep -o 'lines-covered="[0-9]*"' | cut -d '"' -f 2)
local lines_valid=$(echo "$coverage_data" | grep -o 'lines-valid="[0-9]*"' | cut -d '"' -f 2)
if [ $lines_valid == 0 ]; then
local lines_coverage=0
else
local lines_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $lines_covered / $lines_valid}")
fi
# Get branches coverage
local branches_covered=$(echo "$coverage_data" | grep -o 'branches-covered="[0-9]*"' | cut -d '"' -f 2)
local branches_valid=$(echo "$coverage_data" | grep -o 'branches-valid="[0-9]*"' | cut -d '"' -f 2)
if [ $branches_valid == 0 ]; then
local branches_coverage=0
else
local branches_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $branches_covered/$branches_valid}")
fi
# Return values
echo "$lines_covered $lines_valid $lines_coverage $branches_covered $branches_valid $branches_coverage"
}
$BOLD_YELLOW && echo "compare coverage" && $RESET
coverage_PR_xml="log_dir/coverage_PR/coverage.xml"
coverage_PR_data=$(get_coverage_data $coverage_PR_xml)
read lines_PR_covered lines_PR_valid coverage_PR_lines_rate branches_PR_covered branches_PR_valid coverage_PR_branches_rate <<<"$coverage_PR_data"
coverage_base_xml="log_dir/coverage_base/coverage.xml"
coverage_base_data=$(get_coverage_data $coverage_base_xml)
read lines_base_covered lines_base_valid coverage_base_lines_rate branches_base_covered branches_base_valid coverage_base_branches_rate <<<"$coverage_base_data"
$BOLD_BLUE && echo "PR lines coverage: $lines_PR_covered/$lines_PR_valid ($coverage_PR_lines_rate%)" && $RESET
$BOLD_BLUE && echo "PR branches coverage: $branches_PR_covered/$branches_PR_valid ($coverage_PR_branches_rate%)" && $RESET
$BOLD_BLUE && echo "BASE lines coverage: $lines_base_covered/$lines_base_valid ($coverage_base_lines_rate%)" && $RESET
$BOLD_BLUE && echo "BASE branches coverage: $branches_base_covered/$branches_base_valid ($coverage_base_branches_rate%)" && $RESET
$BOLD_YELLOW && echo "clear upload path" && $RESET
rm -fr log_dir/coverage_PR/.coverage*
rm -fr log_dir/coverage_base/.coverage*
rm -fr log_dir/ut-coverage-*
# Declare an array to hold failed items
declare -a fail_items=()
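# Fail the check if PR coverage drops more than 0.05 percentage points below the
# baseline for either lines or branches.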
if (( $(bc -l <<< "${coverage_PR_lines_rate}+0.05 < ${coverage_base_lines_rate}") )); then
fail_items+=("lines")
fi
if (( $(bc -l <<< "${coverage_PR_branches_rate}+0.05 < ${coverage_base_branches_rate}") )); then
fail_items+=("branches")
fi
if [[ ${#fail_items[@]} -ne 0 ]]; then
fail_items_str=$(
IFS=', '
echo "${fail_items[*]}"
)
for item in "${fail_items[@]}"; do
case "$item" in
lines)
decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_lines_rate - $coverage_base_lines_rate" | bc -l)))
;;
branches)
decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_branches_rate - $coverage_base_branches_rate" | bc -l)))
;;
*)
echo "Unknown item: $item"
continue
;;
esac
$BOLD_RED && echo "Unit Test failed with ${item} coverage decrease ${decrease}%" && $RESET
done
$BOLD_RED && echo "compare coverage to give detail info" && $RESET
bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "FAILED" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
exit 1
else
$BOLD_GREEN && echo "Unit Test success with coverage lines: ${coverage_PR_lines_rate}%, branches: ${coverage_PR_branches_rate}%" && $RESET
$BOLD_GREEN && echo "compare coverage to give detail info" && $RESET
bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "SUCCESS" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
fi

View File

@@ -0,0 +1,19 @@
[run]
branch = True
[report]
include =
*/neural_compressor/common/*
*/neural_compressor/torch/*
omit =
*/neural_compressor/torch/algorithms/fp8_quant/*
*/neural_compressor/torch/algorithms/mixed_low_precision/*
*/neural_compressor/torch/amp/*
exclude_lines =
pragma: no cover
raise NotImplementedError
raise TypeError
if self.device == "gpu":
if device == "gpu":
except ImportError:
except Exception as e:

View File

@@ -0,0 +1,15 @@
[run]
branch = True
[report]
include =
*/neural_compressor/torch/algorithms/fp8_quant/*
*/neural_compressor/torch/algorithms/mixed_low_precision/*
exclude_lines =
pragma: no cover
raise NotImplementedError
raise TypeError
if self.device == "gpu":
if device == "gpu":
except ImportError:
except Exception as e:

View File

@@ -0,0 +1,15 @@
[run]
branch = True
[report]
include =
*/neural_compressor/common/*
*/neural_compressor/tensorflow/*
exclude_lines =
pragma: no cover
raise NotImplementedError
raise TypeError
if self.device == "gpu":
if device == "gpu":
except ImportError:
except Exception as e:

View File

@@ -0,0 +1,47 @@
#!/bin/bash
python -c "import neural_compressor as nc"
test_case="run 3x Torch"
echo "${test_case}"
echo "##[section]Run import check"
set -e
python -c "import neural_compressor.torch"
python -c "import neural_compressor.common"
echo "##[section]import check pass"
# install requirements
echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
pip install -r /neural-compressor/test/3x/torch/requirements.txt
pip install pytest-cov
pip install pytest-html
echo "##[endgroup]"
pip list
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_pt
inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
cd /neural-compressor/test/3x || exit 1
rm -rf tensorflow
rm -rf torch/algorithms/fp8_quant
rm -rf torch/quantization/fp8_quant
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_3x_pt.log
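# Generate one pytest command per test file, appending coverage data so all runs
# accumulate into a single .coverage file.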
find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${inc_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh
cat run.sh
bash run.sh 2>&1 | tee ${ut_log_name}
cp report.html ${LOG_DIR}/
if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
echo "Find errors in pytest case, please check the output..."
echo "Please search for '== FAILURES ==' or '== ERRORS =='"
exit 1
fi
# if ut pass, collect the coverage file into artifacts
cp .coverage ${LOG_DIR}/.coverage
echo "UT finished successfully! "

View File

@@ -0,0 +1,63 @@
#!/bin/bash
python -c "import neural_compressor as nc"
test_case="run 3x Torch Habana FP8"
echo "${test_case}"
echo "##[section]Run import check"
set -e
python -c "import neural_compressor.torch"
python -c "import neural_compressor.common"
echo "##[section]import check pass"
# install requirements
echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requirements.txt
sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
cat /neural-compressor/test/3x/torch/requirements.txt
pip install -r /neural-compressor/test/3x/torch/requirements.txt
pip install pytest-cov
pip install pytest-html
pip install pytest-html-merger
echo "##[endgroup]"
pip list
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_pt_fp8
inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
cd /neural-compressor/test/3x || exit 1
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_3x_pt_fp8.log
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-contained-html torch/quantization/weight_only/test_load.py 2>&1 | tee -a ${ut_log_name}
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name}
# pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/weight_only/test_autoround.py 2>&1 | tee -a ${ut_log_name}
# The folder below has its own pytest configuration, so enter it and run the tests separately
cd /neural-compressor/test/3x/torch/algorithms/fp8_quant
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_4.html --self-contained-html . 2>&1 | tee -a ${ut_log_name}
cp .coverage ${LOG_DIR}/.coverage.algo_fp8
cd - && mv /neural-compressor/test/3x/torch/algorithms/fp8_quant/*.html .
# The folder below has its own pytest configuration, so enter it and run the tests separately
cd /neural-compressor/test/3x/torch/quantization/fp8_quant
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_5.html --self-contained-html . 2>&1 | tee -a ${ut_log_name}
cp .coverage ${LOG_DIR}/.coverage.quant_fp8
cd - && mv /neural-compressor/test/3x/torch/quantization/fp8_quant/*.html .
mkdir -p report && mv *.html report
pytest_html_merger -i ./report -o ./report.html
cp report.html ${LOG_DIR}/
if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
echo "Find errors in pytest case, please check the output..."
echo "Please search for '== FAILURES ==' or '== ERRORS =='"
exit 1
fi
# if ut pass, collect the coverage file into artifacts
cp .coverage ${LOG_DIR}/.coverage
cd ${LOG_DIR}
coverage combine .coverage.*
echo "UT finished successfully! "

View File

@@ -0,0 +1,76 @@
#!/bin/bash
python -c "import neural_compressor as nc"
test_case="run 3x TensorFlow"
echo "${test_case}"
echo "##[section]Run import check"
set -e
python -c "import neural_compressor.tensorflow"
python -c "import neural_compressor.common"
echo "##[section]import check pass"
# install requirements
echo "##[group]set up UT env..."
pip install -r /neural-compressor/test/3x/tensorflow/requirements.txt
pip install pytest-cov
pip install pytest-html
pip install pytest-html-merger
echo "##[endgroup]"
pip list
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_tf
inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])')
cd /neural-compressor/test/3x || exit 1
rm -rf torch
rm -rf onnxrt
mv tensorflow/keras ../3x_keras
mv tensorflow/quantization/ptq/newapi ../3x_newapi
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_3x_tf.log
# test for tensorflow ut
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_tf_quant.html --self-contained-html ./tensorflow/quantization 2>&1 | tee -a ${ut_log_name}
rm -rf tensorflow/quantization
pytest --cov="${inc_path}" --cov-append -vs --disable-warnings --html=report_tf_test_quantize_model.html --self-contained-html ./tensorflow/test_quantize_model.py 2>&1 | tee -a ${ut_log_name}
rm -rf tensorflow/test_quantize_model.py
pytest --cov="${inc_path}" --cov-append -vs --disable-warnings --html=report_tf.html --self-contained-html . 2>&1 | tee -a ${ut_log_name}
# test for tensorflow new api ut
pip uninstall tensorflow -y
pip install /tf_dataset/tf_binary/230928/tensorflow*.whl
pip install cmake
pip install protobuf==3.20.3
pip install horovod==0.27.0
pip list
rm -rf tensorflow/*
mkdir -p tensorflow/quantization/ptq
mv ../3x_newapi tensorflow/quantization/ptq/newapi
find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=${inc_path} --cov-append -vs --disable-warnings ,g" > run.sh
cat run.sh
bash run.sh 2>&1 | tee -a ${ut_log_name}
# test for itex ut
rm -rf tensorflow/*
mv ../3x_keras tensorflow/keras
pip uninstall tensorflow -y
pip install intel-extension-for-tensorflow[cpu]
pytest --cov="${inc_path}" --cov-append -vs --disable-warnings --html=report_keras.html --self-contained-html ./tensorflow 2>&1 | tee -a ${ut_log_name}
mkdir -p report
mv *.html report
pytest_html_merger -i ./report -o ./report.html
cp report.html ${LOG_DIR}/
if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then
echo "Find errors in pytest case, please check the output..."
echo "Please search for '== FAILURES ==' or '== ERRORS =='"
exit 1
fi
# if ut pass, collect the coverage file into artifacts
cp .coverage ${LOG_DIR}/.coverage
echo "UT finished successfully! "

View File

@@ -0,0 +1,139 @@
source /neural-compressor/.azure-pipelines/scripts/change_color.sh
pip install coverage
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
coverage_log="/neural-compressor/log_dir/coverage_log"
coverage_log_base="/neural-compressor/log_dir/coverage_log_base"
coverage_compare="/neural-compressor/log_dir/coverage_compare.html"
cd /neural-compressor/log_dir
$BOLD_YELLOW && echo "##[group]collect coverage for PR branch" && $RESET
mkdir -p coverage_PR
cp ut_*_coverage/.coverage.* ./coverage_PR/
cd coverage_PR
coverage combine --keep --rcfile=${COVERAGE_RCFILE}
cp .coverage /neural-compressor/.coverage
cd /neural-compressor
coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log}
coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE}
coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE}
ls -l log_dir/coverage_PR/htmlcov
cd /neural-compressor
cp -r /neural-compressor/.azure-pipelines .azure-pipelines-pr
git config --global --add safe.directory /neural-compressor
git fetch
git checkout master
rm -rf build dist *egg-info
echo y | pip uninstall neural-compressor
cd /neural-compressor/.azure-pipelines-pr/scripts && bash install_nc.sh
echo "##[endgroup]"
$BOLD_YELLOW && echo "##[group]collect coverage for baseline" && $RESET
coverage erase
cd /neural-compressor/log_dir
mkdir -p coverage_base
cp ut-base_*_coverage/.coverage.* ./coverage_base/
cd coverage_base
coverage combine --keep --rcfile=${COVERAGE_RCFILE}
cp .coverage /neural-compressor/.coverage
cd /neural-compressor
coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log_base}
coverage html -d log_dir/coverage_base/htmlcov --rcfile=${COVERAGE_RCFILE}
coverage xml -o log_dir/coverage_base/coverage.xml --rcfile=${COVERAGE_RCFILE}
ls -l log_dir/coverage_base/htmlcov
echo "##[endgroup]"
get_coverage_data() {
# Input argument
local coverage_xml="$1"
# Get coverage data
local coverage_data=$(python3 -c "import xml.etree.ElementTree as ET; root = ET.parse('$coverage_xml').getroot(); print(ET.tostring(root).decode())")
if [[ -z "$coverage_data" ]]; then
echo "Failed to get coverage data from $coverage_xml."
exit 1
fi
# Get lines coverage
local lines_covered=$(echo "$coverage_data" | grep -o 'lines-covered="[0-9]*"' | cut -d '"' -f 2)
local lines_valid=$(echo "$coverage_data" | grep -o 'lines-valid="[0-9]*"' | cut -d '"' -f 2)
if [ $lines_valid == 0 ]; then
local lines_coverage=0
else
local lines_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $lines_covered / $lines_valid}")
fi
# Get branches coverage
local branches_covered=$(echo "$coverage_data" | grep -o 'branches-covered="[0-9]*"' | cut -d '"' -f 2)
local branches_valid=$(echo "$coverage_data" | grep -o 'branches-valid="[0-9]*"' | cut -d '"' -f 2)
if [ $branches_valid == 0 ]; then
local branches_coverage=0
else
local branches_coverage=$(awk "BEGIN {printf \"%.3f\", 100 * $branches_covered/$branches_valid}")
fi
# Return values
echo "$lines_covered $lines_valid $lines_coverage $branches_covered $branches_valid $branches_coverage"
}
$BOLD_YELLOW && echo "compare coverage" && $RESET
coverage_PR_xml="log_dir/coverage_PR/coverage.xml"
coverage_PR_data=$(get_coverage_data $coverage_PR_xml)
read lines_PR_covered lines_PR_valid coverage_PR_lines_rate branches_PR_covered branches_PR_valid coverage_PR_branches_rate <<<"$coverage_PR_data"
coverage_base_xml="log_dir/coverage_base/coverage.xml"
coverage_base_data=$(get_coverage_data $coverage_base_xml)
read lines_base_covered lines_base_valid coverage_base_lines_rate branches_base_covered branches_base_valid coverage_base_branches_rate <<<"$coverage_base_data"
$BOLD_BLUE && echo "PR lines coverage: $lines_PR_covered/$lines_PR_valid ($coverage_PR_lines_rate%)" && $RESET
$BOLD_BLUE && echo "PR branches coverage: $branches_PR_covered/$branches_PR_valid ($coverage_PR_branches_rate%)" && $RESET
$BOLD_BLUE && echo "BASE lines coverage: $lines_base_covered/$lines_base_valid ($coverage_base_lines_rate%)" && $RESET
$BOLD_BLUE && echo "BASE branches coverage: $branches_base_covered/$branches_base_valid ($coverage_base_branches_rate%)" && $RESET
$BOLD_YELLOW && echo "clear upload path" && $RESET
rm -fr log_dir/coverage_PR/.coverage*
rm -fr log_dir/coverage_base/.coverage*
rm -fr log_dir/ut-coverage-*
# Declare an array to hold failed items
declare -a fail_items=()
if (( $(bc -l <<< "${coverage_PR_lines_rate}+0.05 < ${coverage_base_lines_rate}") )); then
fail_items+=("lines")
fi
if (( $(bc -l <<< "${coverage_PR_branches_rate}+0.05 < ${coverage_base_branches_rate}") )); then
fail_items+=("branches")
fi
if [[ ${#fail_items[@]} -ne 0 ]]; then
fail_items_str=$(
IFS=', '
echo "${fail_items[*]}"
)
for item in "${fail_items[@]}"; do
case "$item" in
lines)
decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_lines_rate - $coverage_base_lines_rate" | bc -l)))
;;
branches)
decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_branches_rate - $coverage_base_branches_rate" | bc -l)))
;;
*)
echo "Unknown item: $item"
continue
;;
esac
$BOLD_RED && echo "Unit Test failed with ${item} coverage decrease ${decrease}%" && $RESET
done
$BOLD_RED && echo "compare coverage to give detail info" && $RESET
bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "FAILED" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
exit 1
else
$BOLD_GREEN && echo "Unit Test success with coverage lines: ${coverage_PR_lines_rate}%, branches: ${coverage_PR_branches_rate}%" && $RESET
$BOLD_GREEN && echo "compare coverage to give detail info" && $RESET
bash /neural-compressor/.azure-pipelines-pr/scripts/ut/compare_coverage.sh ${coverage_compare} ${coverage_log} ${coverage_log_base} "SUCCESS" ${coverage_PR_lines_rate} ${coverage_base_lines_rate} ${coverage_PR_branches_rate} ${coverage_base_branches_rate}
fi

View File

@@ -0,0 +1,225 @@
output_file=$1
coverage_pr_log=$2
coverage_base_log=$3
coverage_status=$4
coverage_PR_lines_rate=$5
coverage_base_lines_rate=$6
coverage_PR_branches_rate=$7
coverage_base_branches_rate=$8
module_name="neural_compressor"
[[ ! -f $coverage_pr_log ]] && exit 1
[[ ! -f $coverage_base_log ]] && exit 1
file_name="./coverage_compare"
sed -i "s|\/usr.*${module_name}\/||g" $coverage_pr_log
sed -i "s|\/usr.*${module_name}\/||g" $coverage_base_log
diff $coverage_pr_log $coverage_base_log >diff_file
[[ $? == 0 ]] && exit 0
grep -Po "[<,>,\d].*" diff_file | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7}' | sed "/Name/d" | sed "/TOTAL/d" | sed "/---/d" >$file_name
[[ ! -s $file_name ]] && exit 0
[[ -f $output_file ]] && rm -f $output_file
touch $output_file
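# The comparison report is plain static HTML: a styled head, a coverage summary table,
# and (only when the check fails) a per-file detail table built from the diff output.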
function generate_html_head {
cat >${output_file} <<eof
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>UT coverage</title>
<style type="text/css">
body {
margin: 0;
padding: 0;
background: white no-repeat left top;
}
.main {
margin: 20px auto 10px auto;
background: white;
border-radius: 8px;
-moz-border-radius: 8px;
-webkit-border-radius: 8px;
padding: 0 30px 30px 30px;
border: 1px solid #adaa9f;
box-shadow: 0 2px 2px #9c9c9c;
-moz-box-shadow: 0 2px 2px #9c9c9c;
-webkit-box-shadow: 0 2px 2px #9c9c9c;
}
.features-table {
width: 100%;
margin: 0 auto;
border-collapse: separate;
border-spacing: 0;
text-shadow: 0 1px 0 #fff;
color: #2a2a2a;
background: #fafafa;
background-image: -moz-linear-gradient(top, #fff, #eaeaea, #fff);
/* Firefox 3.6 */
background-image: -webkit-gradient(linear, center bottom, center top, from(#fff), color-stop(0.5, #eaeaea), to(#fff));
font-family: Verdana, Arial, Helvetica
}
.features-table th,
td {
text-align: center;
height: 25px;
line-height: 25px;
padding: 0 8px;
border: 1px solid #cdcdcd;
box-shadow: 0 1px 0 white;
-moz-box-shadow: 0 1px 0 white;
-webkit-box-shadow: 0 1px 0 white;
white-space: nowrap;
}
</style>
</head>
eof
}
function extract_diff_data() {
local file_name=$1 diff_file=$2 reg=$3
local file=$(cat $file_name | grep "${diff_file}" | grep -v ".*/${diff_file}" | grep -Po "${reg}.*" | sed "s/${reg}[ \t]*//g" | awk '{print $1}')
local stmts=$(cat $file_name | grep "${diff_file}" | grep -v ".*/${diff_file}" | grep -Po "${reg}.*" | sed "s/${reg}[ \t]*//g" | awk '{print $2}')
local miss=$(cat $file_name | grep "${diff_file}" | grep -v ".*/${diff_file}" | grep -Po "${reg}.*" | sed "s/${reg}[ \t]*//g" | awk '{print $3}')
local cover=$(cat $file_name | grep "${diff_file}" | grep -v ".*/${diff_file}" | grep -Po "${reg}.*" | sed "s/${reg}[ \t]*//g" | awk '{print $6}')
local branch=$(cat $file_name | grep "${diff_file}" | grep -v ".*/${diff_file}" | grep -Po "${reg}.*" | sed "s/${reg}[ \t]*//g" | awk '{print $4}')
echo "$file $stmts $miss $cover $branch"
}
function write_compare_details() {
local file=$1 stmts1=$2 miss1=$3 branch1=$4 cover1=$5 stmts2=$6 miss2=$7 branch2=$8 cover2=$9
echo """
<tr>
<td>PR | BASE</td>
<td style=\"text-align:left\">${file}</td>
<td style=\"text-align:left\">${stmts1} | ${stmts2}</td>
<td style=\"text-align:left\">${miss1} | ${miss2}</td>
<td style=\"text-align:left\">${branch1} | ${branch2}</td>
<td style=\"text-align:left\">${cover1} | ${cover2}</td>
</tr>
""" >>${output_file}
}
function get_color() {
local decrease=$1
if (($(echo "$decrease < 0" | bc -l))); then
local color="#FFD2D2"
else
local color="#90EE90"
fi
echo "$color"
}
function generate_coverage_summary() {
# generate table head
local Lines_cover_decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_lines_rate - $coverage_base_lines_rate" | bc -l)))
local Branches_cover_decrease=$(echo $(printf "%.3f" $(echo "$coverage_PR_branches_rate - $coverage_base_branches_rate" | bc -l)))
read lines_coverage_color <<<"$(get_color ${Lines_cover_decrease})"
read branches_coverage_color <<<"$(get_color ${Branches_cover_decrease})"
echo """
<body>
<div class="main">
<h1 align="center">Coverage Summary : ${coverage_status}</h1>
<table class=\"features-table\" style=\"width: 60%;margin-left:auto;margin-right:auto;empty-cells: hide\">
<tr>
<th></th>
<th>Base coverage</th>
<th>PR coverage</th>
<th>Diff</th>
</tr>
<tr>
<td> Lines </td>
<td> ${coverage_base_lines_rate}% </td>
<td> ${coverage_PR_lines_rate}% </td>
<td style=\"background-color:${lines_coverage_color}\"> ${Lines_cover_decrease}% </td>
</tr>
<tr>
<td> Branches </td>
<td> ${coverage_base_branches_rate}% </td>
<td> ${coverage_PR_branches_rate}% </td>
<td style=\"background-color:${branches_coverage_color}\"> ${Branches_cover_decrease}% </td>
</tr>
</table>
</div>
""" >>${output_file}
}
function generate_coverage_details() {
echo """
<div class="main">
<h2 align="center">Coverage Detail</h2>
<table class=\"features-table\" style=\"width: 60%;margin-left:auto;margin-right:auto;empty-cells: hide\">
<tr>
<th>Commit</th>
<th>FileName</th>
<th>Stmts</th>
<th>Miss</th>
<th>Branch</th>
<th>Cover</th>
</tr>
""" >>${output_file}
# generate compare detail
cat ${file_name} | while read line; do
if [[ $(echo $line | grep "[0-9]a[0-9]") ]] && [[ $(grep -A 1 "$line" ${file_name} | grep ">") ]]; then
diff_lines=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep ">")
diff_file_name=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep -Po ">.*[a-z,A-Z].*.py" | sed "s|>||g")
for diff_file in ${diff_file_name}; do
diff_file=$(echo "${diff_file}" | sed 's/[ \t]*//g')
diff_coverage_data=$(extract_diff_data ${file_name} ${diff_file} ">")
read file stmts miss cover branch <<<"$diff_coverage_data"
write_compare_details $file "NA" "NA" "NA" "NA" $stmts $miss $branch $cover
done
elif [[ $(echo $line | grep "[0-9]c[0-9]") ]] && [[ $(cat ${file_name} | grep -A 1 "$line" | grep "<") ]]; then
diff_lines=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep "<")
diff_file_name=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep -Po "<.*[a-z,A-Z].*.py" | sed "s|<||g")
for diff_file in ${diff_file_name}; do
diff_file=$(echo "${diff_file}" | sed 's/[ \t]*//g')
diff_coverage_data1=$(extract_diff_data ${file_name} ${diff_file} "<")
read file1 stmts1 miss1 cover1 branch1 <<<"$diff_coverage_data1"
diff_coverage_data2=$(extract_diff_data ${file_name} ${diff_file} ">")
read file2 stmts2 miss2 cover2 branch2 <<<"$diff_coverage_data2"
write_compare_details $file1 $stmts1 $miss1 $branch1 $cover1 $stmts2 $miss2 $branch2 $cover2
done
elif [[ $(echo $line | grep "[0-9]d[0-9]") ]] && [[ $(cat ${file_name} | grep -A 1 "$line" | grep "<") ]]; then
diff_lines=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep "<")
diff_file_name=$(sed -n "/${line}/,/^[0-9]/p" ${file_name} | grep -Po "<.*[a-z,A-Z].*.py" | sed "s|<||g")
for diff_file in ${diff_file_name}; do
diff_file=$(echo "${diff_file}" | sed 's/[ \t]*//g')
diff_coverage_data=$(extract_diff_data ${file_name} ${diff_file} "<")
read file stmts miss cover branch <<<"$diff_coverage_data"
write_compare_details $file $stmts $miss $branch $cover "NA" "NA" "NA" "NA"
done
fi
done
# generate table end
echo """
</table>
</div>
</body>
</html>""" >>${output_file}
}
function main {
generate_html_head
generate_coverage_summary
if [[ ${coverage_status} = "SUCCESS" ]]; then
echo """</body></html>""" >>${output_file}
echo "coverage PASS, no need to compare difference"
exit 0
else
generate_coverage_details
fi
}
main

View File

@@ -0,0 +1,30 @@
[run]
branch = True
[report]
omit =
*/**/fake*yaml
*/**/fake.py
*/neural_compressor/model/nets_factory.py
*/neural_compressor/benchmark.py
*/neural_compressor/experimental/benchmark.py
*/neural_compressor/contrib/strategy/tpe.py
*/intel_extension_for_transformers/backends/*
*/intel_extension_for_transformers/optimization/utils/get_throughput.py
*/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_decomposed_in.py
*/neural_compressor/adaptor/tf_utils/quantize_graph/qdq/fuse_qdq_in.py
*/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/freeze_value.py
*/neural_compressor/template/*
*/neural_compressor/common/*
*/neural_compressor/torch/*
*/neural_compressor/tensorflow/*
exclude_lines =
pragma: no cover
raise NotImplementedError
raise TypeError
if self.device == "gpu":
if device == "gpu":
except ImportError:
except Exception as e:
onnx_version < ONNX18_VERSION
onnx_version >= ONNX18_VERSION

View File

@@ -0,0 +1,116 @@
#!/bin/bash
set -x
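# Framework versions (tensorflow_version, itex_version, pytorch_version, ...) are expected
# to be exported by the caller (fwk_version.sh or the run_basic_* scripts) before this script runs.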
echo "copy pre-train model..."
mkdir -p /tmp/.neural_compressor/inc_ut || true
cp -r /tf_dataset/ut-localfile/resnet_v2 /tmp/.neural_compressor/inc_ut || true
mkdir -p ~/.keras/datasets || true
cp -r /tf_dataset/ut-localfile/cifar-10-batches-py* ~/.keras/datasets || true
ls -l ~/.keras/datasets
echo "install dependencies..."
echo "tensorflow version is $tensorflow_version"
echo "itex version is $itex_version"
echo "pytorch version is $pytorch_version"
echo "torchvision version is $torchvision_version"
echo "ipex version is $ipex_version"
echo "onnx version is $onnx_version"
echo "onnxruntime version is $onnxruntime_version"
echo "mxnet version is $mxnet_version"
test_case=$1
echo -e "##[group]test case is ${test_case}"
if [[ "${tensorflow_version}" == *"-official" ]]; then
pip install tensorflow==${tensorflow_version%-official}
elif [[ "${tensorflow_version}" == "spr-base" ]]; then
pip install /tf_dataset/tf_binary/230928/tensorflow*.whl
pip install cmake
pip install protobuf==3.20.3
pip install horovod==0.27.0
if [[ $? -ne 0 ]]; then
exit 1
fi
elif [[ "${tensorflow_version}" != "" ]]; then
pip install intel-tensorflow==${tensorflow_version}
fi
if [[ "${itex_version}" != "" ]]; then
pip install --upgrade intel-extension-for-tensorflow[cpu]==${itex_version}
pip install tf2onnx
fi
if [[ "${pytorch_version}" != "" ]]; then
pip install torch==${pytorch_version} -f https://download.pytorch.org/whl/torch_stable.html
fi
if [[ "${torchvision_version}" != "" ]]; then
pip install torchvision==${torchvision_version} -f https://download.pytorch.org/whl/torch_stable.html
fi
if [[ "${ipex_version}" != "" ]]; then
pip install intel-extension-for-pytorch=="${ipex_version%+cpu}"
fi
if [[ "${onnx_version}" != "" ]]; then
pip install onnx==${onnx_version}
fi
if [[ "${onnxruntime_version}" != "" ]]; then
pip install onnxruntime==${onnxruntime_version}
if [[ "${onnxruntime_version}" == "1.14"* ]]; then
pip install onnxruntime-extensions==0.8.0
else
pip install onnxruntime-extensions
fi
pip install optimum
fi
if [ "${mxnet_version}" != '' ]; then
pip install numpy==1.23.5
echo "re-install pycocotools resolve the issue with numpy..."
pip uninstall pycocotools -y
pip install --no-cache-dir pycocotools
pip install mxnet==${mxnet_version}
fi
# install special test env requirements
# common deps
pip install cmake
pip install transformers
if [[ $(echo "${test_case}" | grep -c "others") != 0 ]];then
pip install tf_slim xgboost accelerate==0.21.0 peft
elif [[ $(echo "${test_case}" | grep -c "nas") != 0 ]]; then
pip install dynast==1.6.0rc1
elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
pip install tensorflow-addons
# Workaround:
# horovod can't be installed in an env that has both TF and PT,
# so distributed cases are tested in an env with a single framework installed
pip install horovod
fi
if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
pip install auto-round
fi
# test deps
pip install coverage
pip install pytest
pip install pytest-html
echo "##[endgroup]"
pip list
echo "[DEBUG] list pipdeptree..."
pip install pipdeptree
pipdeptree
# import torch before importing tensorflow
if [[ $(echo "${test_case}" | grep -c "run basic api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "run basic others") != 0 ]] || [[ $(echo "${test_case}" | grep -c "run basic adaptor") != 0 ]]; then
cd /neural-compressor/test || exit 1
find . -name "test*.py" | xargs sed -i 's/import tensorflow as tf/import torch; import tensorflow as tf/g'
find . -name "test*.py" | xargs sed -i 's/import tensorflow.compat.v1 as tf/import torch; import tensorflow.compat.v1 as tf/g'
find . -name "test*.py" | xargs sed -i 's/from tensorflow import keras/import torch; from tensorflow import keras/g'
fi

View File

@@ -0,0 +1,35 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic adaptor"
echo "${test_case}"
echo "specify fwk version..."
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh $1
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
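# Build run.sh: every adaptor test file found below becomes one
# "coverage run --source=${lpot_path} --append <test file> --verbose" command.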
find ./adaptor -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_adaptor.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.adaptor
echo "UT finished successfully! "

View File

@@ -0,0 +1,33 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic tfnewapi"
echo "${test_case}"
echo "specify fwk version..."
export tensorflow_version='spr-base'
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
# export FORCE_BF16=1
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find ./tfnewapi -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_tf_newapi.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.tfnewapi
echo "UT finished successfully! "

View File

@@ -0,0 +1,38 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic api quantization/benchmark/export/mixed_precision/distillation/scheduler/nas"
echo "${test_case}"
echo "specify fwk version..."
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh $1
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find ./quantization* -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
find ./benchmark* -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
find ./export* -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
find ./mixed_precision* -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
find ./distillation -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
find ./scheduler -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
find ./nas -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_api.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.api
echo "UT finished successfully! "

View File

@@ -0,0 +1,35 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic itex"
echo "${test_case}"
echo "specify fwk version..."
export itex_version='2.15.0.0'
export tensorflow_version='2.15.0-official'
export onnx_version='1.16.0'
export onnxruntime_version='1.18.0'
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find ./itex -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_itex.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.itex
echo "UT finished successfully! "

View File

@@ -0,0 +1,50 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic others"
echo "${test_case}"
echo "specify fwk version..."
source /neural-compressor/.azure-pipelines/scripts/fwk_version.sh $1
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find . -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
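# Drop the suites that already run in dedicated UT jobs (adaptor, tfnewapi, itex, pruning,
# the user-facing API cases, 3x and distributed) so only the remaining cases run here.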
sed -i '/ adaptor\//d' run.sh
sed -i '/ tfnewapi\//d' run.sh
sed -i '/ itex\//d' run.sh
sed -i '/ pruning_with_pt/d' run.sh
sed -i '/ pruning_with_tf/d' run.sh
sed -i '/ quantization/d' run.sh
sed -i '/ benchmark/d' run.sh
sed -i '/ export/d' run.sh
sed -i '/ mixed_precision/d' run.sh
sed -i '/ distillation\//d' run.sh
sed -i '/ scheduler\//d' run.sh
sed -i '/ nas\//d' run.sh
sed -i '/ 3x\//d' run.sh
sed -i '/ distributed\//d' run.sh
echo "copy model for dynas..."
mkdir -p .torch/ofa_nets || true
cp -r /tf_dataset/ut-localfile/ofa_mbv3_d234_e346_k357_w1.2 .torch/ofa_nets || true
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_others.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.others
echo "UT finished successfully! "

View File

@@ -0,0 +1,35 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic pt pruning"
echo "${test_case}"
echo "specify fwk version..."
export pytorch_version='2.4.0+cpu'
export torchvision_version='0.18.0+cpu'
export ipex_version='2.4.0+cpu'
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find ./pruning_with_pt -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
# find ./distributed -name "test_distributed_pt_train.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_pt_pruning.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.pt_pruning
echo "UT finished successfully! "

View File

@@ -0,0 +1,33 @@
#!/bin/bash
python -c "import neural_compressor as nc;print(nc.version.__version__)"
test_case="run basic tf pruning"
echo "${test_case}"
echo "specify fwk version..."
export tensorflow_version='2.14.0'
echo "set up UT env..."
bash /neural-compressor/.azure-pipelines/scripts/ut/env_setup.sh "${test_case}"
export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.file
lpot_path=$(python -c 'import neural_compressor; import os; print(os.path.dirname(neural_compressor.__file__))')
cd /neural-compressor/test || exit 1
find ./pruning_with_tf -name "test*.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'> run.sh
find ./distributed -name "test_distributed_tf_dataloader.py" | sed 's,\.\/,coverage run --source='"${lpot_path}"' --append ,g' | sed 's/$/ --verbose/'>> run.sh
LOG_DIR=/neural-compressor/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut_tf_pruning.log
echo "cat run.sh..."
sort run.sh -o run.sh
cat run.sh | tee ${ut_log_name}
echo "------UT start-------"
bash -x run.sh 2>&1 | tee -a ${ut_log_name}
echo "------UT end -------"
if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
echo "Find errors in UT test, please check the output..."
exit 1
fi
cp .coverage ${LOG_DIR}/.coverage.tf_pruning
echo "UT finished successfully! "

View File

@@ -0,0 +1,42 @@
parameters:
- name: codeScanFileName
type: string
- name: uploadPath
type: string
- name: codeScanContainerName
type: string
default: "codeScan"
- name: scanModule
type: string
default: "neural_compressor"
steps:
- template: docker-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
repoName: "code-scan"
repoTag: "1.0"
dockerFileName: "DockerfileCodeScan"
containerName: ${{ parameters.codeScanContainerName }}
- script: |
docker exec ${{ parameters.codeScanContainerName }} bash -c "bash /neural-compressor/.azure-pipelines/scripts/codeScan/${{ parameters.codeScanFileName }}/${{ parameters.codeScanFileName }}.sh \
--scan_module=${{ parameters.scanModule }}"
displayName: "${{ parameters.codeScanFileName }} Check"
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: .azure-pipelines/scripts/codeScan/scanLog/${{ parameters.uploadPath }}
artifact: $(System.JobAttempt)_${{ parameters.codeScanFileName }}
publishLocation: "pipeline"
displayName: "PublishPipelineArtifact"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec ${{ parameters.codeScanContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,103 @@
parameters:
- name: dockerConfigName
type: string
default: "commonDockerConfig"
- name: repoName
type: string
default: "neural-compressor"
- name: repoTag
type: string
default: "py310"
- name: dockerFileName
type: string
default: "Dockerfile"
- name: containerName
type: string
- name: repo
type: string
default: "https://github.com/intel/neural-compressor"
- name: imageSource
type: string
default: "build"
steps:
- task: Bash@3
inputs:
targetType: "inline"
script: |
docker ps -a
if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then
docker start $(docker ps -aq --filter "name=${{ parameters.containerName }}")
echo "remove left files through container ..."
docker exec ${{ parameters.containerName }} bash -c "ls -a /neural-compressor && rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* && ls -a /neural-compressor || true"
fi
displayName: "Docker workspace clean up"
- ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}:
- script: |
rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true
displayName: "Clean workspace"
- checkout: self
clean: true
displayName: "Checkout out Repo"
fetchDepth: 0
- ${{ if eq(parameters.dockerConfigName, 'gitCloneDockerConfig') }}:
- script: |
rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true
mkdir ${BUILD_SOURCESDIRECTORY}
chmod 777 ${BUILD_SOURCESDIRECTORY}
displayName: "Clean workspace"
- checkout: none
- script: |
git clone ${{ parameters.repo }} ${BUILD_SOURCESDIRECTORY}
git config --global --add safe.directory ${BUILD_SOURCESDIRECTORY}
cd ${BUILD_SOURCESDIRECTORY}
git checkout master
displayName: "Checkout out master"
- ${{ if eq(parameters.imageSource, 'build') }}:
- script: |
docker image prune -a -f
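# Rebuild the devel image only if the requested repoName:repoTag is not already present on the agent.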
if [[ ! $(docker images | grep -i ${{ parameters.repoName }}:${{ parameters.repoTag }}) ]]; then
docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/${{parameters.dockerFileName}}.devel -t ${{ parameters.repoName }}:${{ parameters.repoTag }} .
fi
docker images | grep -i ${{ parameters.repoName }}
if [[ $? -ne 0 ]]; then
echo "NO Such Repo"
exit 1
fi
displayName: "Build develop docker image"
- ${{ if eq(parameters.imageSource, 'pull') }}:
- script: |
docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
displayName: "Pull habana docker image"
- script: |
docker stop $(docker ps -aq --filter "name=${{ parameters.containerName }}")
docker rm -vf ${{ parameters.containerName }} || true
env | sort
displayName: "Clean docker container"
- ${{ if ne(parameters.containerName, '') }}:
- task: Bash@3
inputs:
targetType: "inline"
script: |
if [[ "${{ parameters.imageSource }}" == "build" ]]; then
docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
-v ${BUILD_SOURCESDIRECTORY}:/neural-compressor -v /tf_dataset:/tf_dataset -v /tf_dataset2:/tf_dataset2 \
${{ parameters.repoName }}:${{ parameters.repoTag }}
else
docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
docker exec ${{ parameters.containerName }} bash -c "ln -sf \$(which python3) /usr/bin/python"
fi
echo "Show the container list after docker run ... "
docker ps -a
displayName: "Docker run - ${{ parameters.containerName }} Container"

View File

@@ -0,0 +1,80 @@
parameters:
- name: modelName
type: string
default: "resnet50v1.5"
- name: framework
type: string
default: "tensorflow"
- name: APIVersion
type: string
default: ""
- name: modelContainerName
type: string
default: "model"
steps:
- template: docker-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
repoName: "neural-compressor"
repoTag: "py310"
dockerFileName: "Dockerfile"
containerName: ${{ parameters.modelContainerName }}
- script: |
docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
&& bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='env_setup'"
displayName: Env setup
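# Download the *_summary.log artifacts from a previous Model-Test run (run id taken from the
# refer_buildId variable) to serve as the accuracy/performance reference for comparison.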
- task: DownloadPipelineArtifact@2
continueOnError: true
inputs:
source: "specific"
artifact: ${{ parameters.framework }}_${{ parameters.modelName }}
patterns: "**_summary.log"
path: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/${{ parameters.modelName }}_refer_log
project: $(System.TeamProject)
pipeline: "Model-Test"
runVersion: "specific"
runId: $(refer_buildId)
retryDownloadCount: 3
displayName: "Download refer logs"
- script: |
docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
&& bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='tuning'"
displayName: Quantization
- ${{ if ne(parameters.APIVersion, '3x') }}:
- script: |
docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
&& bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='int8_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)"
displayName: INT8 Benchmark
- script: |
docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
&& bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='fp32_benchmark' --USE_TUNE_ACC=$(USE_TUNE_ACC) --PERF_STABLE_CHECK=$(PERF_STABLE_CHECK)"
displayName: FP32 Benchmark
- task: Bash@3
inputs:
targetType: "inline"
script: |
docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \
&& bash run_${{ parameters.framework }}_models_trigger.sh --model=${{ parameters.modelName }} --mode='collect_log' --BUILD_BUILDID=$(Build.BuildId)"
displayName: Collect log
- task: PublishPipelineArtifact@1
inputs:
targetPath: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/${{ parameters.modelName }}/
artifact: ${{ parameters.framework }}_${{ parameters.modelName }}
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec ${{ parameters.modelContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,61 @@
parameters:
- name: dockerConfigName
type: string
default: "commonDockerConfig"
- name: repo
type: string
default: "https://github.com/intel/neural-compressor"
- name: utScriptFileName
type: string
- name: uploadPath
type: string
- name: utArtifact
type: string
- name: utTestMode
type: string
default: "coverage"
- name: utContainerName
type: string
default: "utTest"
- name: imageSource
type: string
default: "build"
steps:
- template: docker-template.yml
parameters:
dockerConfigName: ${{ parameters.dockerConfigName }}
repoName: "neural-compressor"
repoTag: "py310"
dockerFileName: "Dockerfile"
containerName: ${{ parameters.utContainerName }}
repo: ${{ parameters.repo }}
imageSource: ${{ parameters.imageSource }}
- script: |
docker exec ${{ parameters.utContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts \
&& bash install_nc.sh ${{ parameters.utScriptFileName }} \
&& bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}"
displayName: "Run UT"
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: ${{ parameters.uploadPath }}
artifact: $(System.JobAttempt)_${{ parameters.utArtifact }}_report
publishLocation: "pipeline"
- ${{ if eq(parameters.utTestMode, 'coverage') }}:
- task: PublishPipelineArtifact@1
inputs:
targetPath: ${{ parameters.uploadPath }}
artifact: ${{ parameters.utArtifact }}_coverage
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec ${{ parameters.utContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,118 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- .azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
- .azure-pipelines/scripts/install_nc.sh
- .azure-pipelines/ut-3x-pt-fp8.yml
- .azure-pipelines/template/docker-template.yml
- neural_compressor/common
- neural_compressor/torch
- test/3x/torch/algorithms/fp8_quant
- test/3x/torch/quantization/fp8_quant
- test/3x/torch/quantization/weight_only/test_rtn.py
- test/3x/torch/quantization/weight_only/test_load.py
- setup.py
- requirements_pt.txt
pool: GAUDI
variables:
IMAGE_NAME: "neural-compressor"
IMAGE_TAG: "py310"
UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir
ARTIFACT_NAME: "UT_coverage_report_3x_pt_fp8"
REPO: $(Build.Repository.Uri)
stages:
- stage: Torch_habana
displayName: Torch 3x Habana FP8
dependsOn: []
jobs:
- job:
displayName: Torch 3x Habana FP8
steps:
- template: template/ut-template.yml
parameters:
imageSource: "pull"
dockerConfigName: "commonDockerConfig"
utScriptFileName: "3x/run_3x_pt_fp8"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x"
- stage: Torch_habana_baseline
displayName: Torch 3x Habana FP8 baseline
dependsOn: []
jobs:
- job:
displayName: Torch 3x Habana FP8 baseline
steps:
- template: template/ut-template.yml
parameters:
imageSource: "pull"
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "3x/run_3x_pt_fp8"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x_baseline"
- stage: Coverage
displayName: "Coverage Compare"
pool:
vmImage: "ubuntu-latest"
dependsOn: [Torch_habana, Torch_habana_baseline]
jobs:
- job: CollectDatafiles
steps:
- script: |
if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then
docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} .
fi
docker images | grep -i ${IMAGE_NAME}
if [[ $? -ne 0 ]]; then
echo "NO Such Repo"
exit 1
fi
displayName: "Build develop docker image"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: '*_coverage/.coverage'
path: $(DOWNLOAD_PATH)
- script: |
echo "--- create container ---"
docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash
echo "--- docker ps ---"
docker ps
echo "--- collect logs ---"
docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \
&& bash install_nc.sh 3x_pt_fp8 \
&& bash ut/3x/collect_log_3x.sh 3x_pt_fp8"
displayName: "Collect UT Coverage"
- task: PublishCodeCoverageResults@2
inputs:
summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: $(UPLOAD_PATH)
artifact: $(ARTIFACT_NAME)
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,116 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor/common
- neural_compressor/torch
- test/3x/torch
- test/3x/common
- setup.py
- requirements_pt.txt
- .azure-pipelines/ut-3x-pt.yml
- .azure-pipelines/template/docker-template.yml
- .azure-pipelines/scripts/install_nc.sh
- .azure-pipelines/scripts/ut/3x/run_3x_pt.sh
pool: ICX-16C
variables:
IMAGE_NAME: "neural-compressor"
IMAGE_TAG: "py310"
UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir
ARTIFACT_NAME: "UT_coverage_report_3x_pt"
REPO: $(Build.Repository.Uri)
stages:
- stage: Torch
displayName: Unit Test 3x Torch
dependsOn: []
jobs:
- job:
displayName: Unit Test 3x Torch
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "3x/run_3x_pt"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x"
- stage: Torch_baseline
displayName: Unit Test 3x Torch baseline
dependsOn: []
jobs:
- job:
displayName: Unit Test 3x Torch baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "3x/run_3x_pt"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x_baseline"
repo: $(REPO)
- stage: Coverage
displayName: "Coverage Compare"
pool:
vmImage: "ubuntu-latest"
dependsOn: [Torch, Torch_baseline]
jobs:
- job: CollectDatafiles
steps:
- script: |
if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then
docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} .
fi
docker images | grep -i ${IMAGE_NAME}
if [[ $? -ne 0 ]]; then
echo "NO Such Repo"
exit 1
fi
displayName: "Build develop docker image"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: '*_coverage/.coverage'
path: $(DOWNLOAD_PATH)
- script: |
echo "--- create container ---"
docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash
echo "--- docker ps ---"
docker ps
echo "--- collect logs ---"
docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \
&& bash install_nc.sh 3x_pt \
&& bash ut/3x/collect_log_3x.sh 3x_pt"
displayName: "Collect UT Coverage"
- task: PublishCodeCoverageResults@2
inputs:
summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: $(UPLOAD_PATH)
artifact: $(ARTIFACT_NAME)
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,113 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor/common
- neural_compressor/tensorflow
- test/3x/tensorflow
- test/3x/common
- setup.py
- requirements_tf.txt
- .azure-pipelines/scripts/ut/3x/run_3x_tf.sh
- .azure-pipelines/template/docker-template.yml
pool: ICX-16C
variables:
IMAGE_NAME: "neural-compressor"
IMAGE_TAG: "py310"
UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir
ARTIFACT_NAME: "UT_coverage_report_3x_tf"
REPO: $(Build.Repository.Uri)
stages:
- stage: TensorFlow
displayName: Unit Test 3x TensorFlow
dependsOn: []
jobs:
- job:
displayName: Unit Test 3x TensorFlow
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "3x/run_3x_tf"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x"
- stage: TensorFlow_baseline
displayName: Unit Test 3x TensorFlow baseline
dependsOn: []
jobs:
- job:
displayName: Unit Test 3x TensorFlow baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "3x/run_3x_tf"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_3x_baseline"
repo: $(REPO)
- stage: Coverage
displayName: "Coverage Compare"
pool:
vmImage: "ubuntu-latest"
dependsOn: [TensorFlow, TensorFlow_baseline]
jobs:
- job: CollectDatafiles
steps:
- script: |
if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then
docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} .
fi
docker images | grep -i ${IMAGE_NAME}
if [[ $? -ne 0 ]]; then
echo "NO Such Repo"
exit 1
fi
displayName: "Build develop docker image"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: '*_coverage/.coverage'
path: $(DOWNLOAD_PATH)
- script: |
echo "--- create container ---"
docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash
echo "--- docker ps ---"
docker ps
echo "--- collect logs ---"
docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \
&& bash install_nc.sh 3x_tf \
&& bash ut/3x/collect_log_3x.sh 3x_tf"
displayName: "Collect UT Coverage"
- task: PublishCodeCoverageResults@2
inputs:
summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: $(UPLOAD_PATH)
artifact: $(ARTIFACT_NAME)
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,287 @@
trigger: none
pr:
autoCancel: true
drafts: false
branches:
include:
- master
paths:
include:
- neural_compressor
- test
- setup.py
- requirements.txt
- .azure-pipelines/ut-basic.yml
- .azure-pipelines/template/docker-template.yml
- .azure-pipelines/scripts/ut
- .azure-pipelines/scripts/fwk_version.sh
- .azure-pipelines/scripts/install_nc.sh
exclude:
- test/3x
- neural_compressor/common
- neural_compressor/torch
- neural_compressor/tensorflow
- neural_compressor/onnxrt
- neural_compressor/transformers
- neural_compressor/evaluation
- .azure-pipelines/scripts/ut/3x
pool: ICX-16C
variables:
IMAGE_NAME: "neural-compressor"
IMAGE_TAG: "py310"
UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir
ARTIFACT_NAME: "UT_coverage_report"
REPO: $(Build.Repository.Uri)
stages:
- stage: Adaptor
displayName: Unit Test FWKs adaptor
dependsOn: []
jobs:
- job:
displayName: Test FWKs adaptor
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_adaptor"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_adaptor"
- stage: API
displayName: Unit Test User facing API
dependsOn: []
jobs:
- job:
displayName: Test User facing API
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_api"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_api"
- stage: Pruning
displayName: Unit Test Pruning
dependsOn: []
jobs:
- job:
displayName: Test PyTorch Pruning
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_pt_pruning"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_pt-pruning"
- job:
displayName: Test TensorFlow Pruning
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_tf_pruning"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_tf-pruning"
- stage: TFNewAPI
displayName: Unit Test TF newAPI
dependsOn: []
jobs:
- job:
displayName: Test TF newAPI
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_adaptor_tfnewapi"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_tfnewapi"
- stage: ITEX
displayName: Unit Test ITEX
dependsOn: []
jobs:
- job:
displayName: Test ITEX
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_itex"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_itex"
- stage: Others
displayName: Unit Test other basic case
dependsOn: []
jobs:
- job:
displayName: Test other basic case
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "commonDockerConfig"
utScriptFileName: "run_basic_others"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut_others"
- stage: Adaptor_base
displayName: Unit Test FWKs adaptor baseline
dependsOn: []
jobs:
- job:
displayName: Test FWKs adaptor baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_adaptor"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_adaptor"
repo: $(REPO)
- stage: API_base
displayName: Unit Test User facing API baseline
dependsOn: []
jobs:
- job:
displayName: Test User facing API baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_api"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_api"
repo: $(REPO)
- stage: Pruning_base
displayName: Unit Test Pruning baseline
dependsOn: []
jobs:
- job:
displayName: Test PyTorch Pruning baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_pt_pruning"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_pt-pruning"
repo: $(REPO)
- job:
displayName: Test TensorFlow Pruning baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_tf_pruning"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_tf-pruning"
repo: $(REPO)
- stage: TFNewAPI_base
displayName: Unit Test TF newAPI baseline
dependsOn: []
jobs:
- job:
displayName: Test TF newAPI baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_adaptor_tfnewapi"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_tfnewapi"
repo: $(REPO)
- stage: ITEX_base
displayName: Unit Test ITEX baseline
dependsOn: []
jobs:
- job:
displayName: Test ITEX baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_itex"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_itex"
repo: $(REPO)
- stage: Others_base
displayName: Unit Test other cases baseline
dependsOn: []
jobs:
- job:
displayName: Test other cases baseline
steps:
- template: template/ut-template.yml
parameters:
dockerConfigName: "gitCloneDockerConfig"
utScriptFileName: "run_basic_others"
uploadPath: $(UPLOAD_PATH)
utArtifact: "ut-base_others"
repo: $(REPO)
- stage: Coverage
displayName: "Coverage Compare"
pool:
vmImage: "ubuntu-latest"
dependsOn: [Adaptor, API, Pruning, TFNewAPI, ITEX, Others, Adaptor_base, API_base, Pruning_base, TFNewAPI_base, ITEX_base, Others_base]
jobs:
- job: CollectDatafiles
steps:
- script: |
if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then
docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} .
fi
docker images | grep -i ${IMAGE_NAME}
if [[ $? -ne 0 ]]; then
echo "NO Such Repo"
exit 1
fi
displayName: "Build develop docker image"
- task: DownloadPipelineArtifact@2
inputs:
artifact:
patterns: '*_coverage/.coverage.*'
path: $(DOWNLOAD_PATH)
- script: |
echo "--- create container ---"
docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash
echo "--- docker ps ---"
docker ps
echo "--- collect logs ---"
docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \
&& bash install_nc.sh \
&& bash ut/collect_log.sh"
displayName: "Collect UT Coverage"
- task: PublishCodeCoverageResults@2
inputs:
summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml
- task: PublishPipelineArtifact@1
condition: succeededOrFailed()
inputs:
targetPath: $(UPLOAD_PATH)
artifact: $(ARTIFACT_NAME)
publishLocation: "pipeline"
- task: Bash@3
condition: always()
inputs:
targetType: "inline"
script: |
docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true"
displayName: "Docker clean up"

View File

@@ -0,0 +1,107 @@
custom_service_name: "CI checker"
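# Maps changed paths to the CI checks that must pass for each subproject; the "!"-prefixed
# paths appear to act as exclusions.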
subprojects:
- id: "Code Scan Tests workflow"
paths:
- "neural_compressor/**"
- "setup.py"
- "requirements.txt"
- ".azure-pipelines/code-scan.yml"
- ".azure-pipelines/scripts/codeScan/**"
checks:
- "Code-Scan"
- "Code-Scan (Bandit Code Scan Bandit)"
- "Code-Scan (DocStyle Code Scan DocStyle)"
- id: "Model Tests workflow"
paths:
- "neural_compressor/**"
- "setup.py"
- "requirements.txt"
- ".azure-pipelines/scripts/models/**"
- "examples/tensorflow/oob_models/quantization/ptq/**"
- "!test"
- "!neural_compressor/common/**"
- "!neural_compressor/torch/**"
- "!neural_compressor/tensorflow/**"
- "!neural_compressor/onnxrt/**"
checks:
- "Model-Test"
- "Model-Test (Generate Report GenerateReport)"
- "Model-Test (Run ONNX Model resnet50-v1-12)"
- "Model-Test (Run PyTorch Model resnet18_fx)"
- "Model-Test (Run TensorFlow Model resnet50v1.5)"
- "Model-Test (Run TensorFlow Model ssd_resnet50_v1)"
- id: "Model Tests 3x workflow"
paths:
- "neural_compressor/common/**"
- "neural_compressor/torch/**"
- "examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/**"
- "setup.py"
- "requirements_pt.txt"
- ".azure-pipelines/scripts/models/**"
checks:
- "Model-Test-3x"
- "Model-Test-3x (Generate Report GenerateReport)"
- "Model-Test-3x (Run PyTorch Model opt_125m_woq_gptq_int4)"
- "Model-Test-3x (Run PyTorch Model opt_125m_woq_gptq_nf4_dq_bnb)"
- "Model-Test-3x (Run PyTorch Model opt_125m_woq_gptq_int4_dq_ggml)"
- id: "Unit Tests basic workflow"
paths:
- "neural_compressor/**"
- "test/**"
- "setup.py"
- "requirements.txt"
- ".azure-pipelines/scripts/ut/**"
- "!test/3x/**"
- "!neural_compressor/common/**"
- "!neural_compressor/torch/**"
- "!neural_compressor/tensorflow/**"
- "!neural_compressor/onnxrt/**"
- "!.azure-pipelines/scripts/ut/3x/**"
checks:
- "UT-Basic"
- "UT-Basic (Coverage Compare CollectDatafiles)"
- "UT-Basic (Unit Test FWKs adaptor Test FWKs adaptor)"
- "UT-Basic (Unit Test FWKs adaptor baseline Test FWKs adaptor baseline)"
- "UT-Basic (Unit Test ITEX Test ITEX)"
- "UT-Basic (Unit Test ITEX baseline Test ITEX baseline)"
- "UT-Basic (Unit Test Pruning Test PyTorch Pruning)"
- "UT-Basic (Unit Test Pruning Test TensorFlow Pruning)"
- "UT-Basic (Unit Test Pruning baseline Test PyTorch Pruning baseline)"
- "UT-Basic (Unit Test Pruning baseline Test TensorFlow Pruning baseline)"
- "UT-Basic (Unit Test TF newAPI Test TF newAPI)"
- "UT-Basic (Unit Test TF newAPI baseline Test TF newAPI baseline)"
- "UT-Basic (Unit Test User facing API Test User facing API)"
- "UT-Basic (Unit Test User facing API baseline Test User facing API baseline)"
- "UT-Basic (Unit Test other basic case Test other basic case)"
- "UT-Basic (Unit Test other cases baseline Test other cases baseline)"
- id: "Unit Tests 3x-TensorFlow workflow"
paths:
- "neural_compressor/common/**"
- "neural_compressor/tensorflow/**"
- "test/3x/tensorflow/**"
- "setup.py"
- "requirements_tf.txt"
checks:
- "UT-3x-TensorFlow"
- "UT-3x-TensorFlow (Coverage Compare CollectDatafiles)"
- "UT-3x-TensorFlow (Unit Test 3x TensorFlow Unit Test 3x TensorFlow)"
- "UT-3x-TensorFlow (Unit Test 3x TensorFlow baseline Unit Test 3x TensorFlow baseline)"
- id: "Unit Tests 3x-PyTorch workflow"
paths:
- "neural_compressor/common/**"
- "neural_compressor/torch/**"
- "test/3x/torch/**"
- "test/3x/common/**"
- "setup.py"
- "requirements_pt.txt"
- ".azure-pipelines/scripts/ut/3x/collect_log_3x.sh"
checks:
- "UT-3x-Torch"
- "UT-3x-Torch (Coverage Compare CollectDatafiles)"
- "UT-3x-Torch (Unit Test 3x Torch Unit Test 3x Torch)"
- "UT-3x-Torch (Unit Test 3x Torch baseline Unit Test 3x Torch baseline)"

View File

@@ -0,0 +1,13 @@
Copyright (c) 2025 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,20 @@
## Type of Change
feature or bug fix or documentation or validation or others
API changed or not
## Description
detailed description
## Expected Behavior & Potential Risk
the expected behavior triggered by this PR
## How has this PR been tested?
how to reproduce the test (including hardware information)
## Dependency Change?
any library dependency introduced or removed

View File

@@ -0,0 +1,18 @@
name: Scanner BDBA
permissions: read-all
on:
workflow_dispatch:
jobs:
bdba_job:
name: BDBA Scan
uses: intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Bdba.yml@one-ci-cd
with:
repos: ${{ github.event.repository.name }}
refs: ${{ github.ref_name }}
group: "22"
runners: "['self-hosted']"
secrets:
token: ${{ secrets.GITHUB_TOKEN }}
BDBA_TOKEN: ${{ secrets.BDBA_TOKEN }}

View File

@@ -0,0 +1,20 @@
name: Scanner Coverity
permissions: read-all
on:
workflow_dispatch:
jobs:
coverity_job:
uses: intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Coverity.yml@one-ci-cd
with:
repos: ${{ github.event.repository.name }}
refs: ${{ github.ref_name }}
projectType: python
url: 'https://coverityent.devtools.intel.com/prod1'
stream: 'IntelNeuralCompressor-master'
runners: "['self-hosted']"
secrets:
token: ${{ secrets.GITHUB_TOKEN }}
USER: ${{secrets.COVERITY_USER }}
PASSWORD : ${{secrets.COVERITY_PASSWORD }}

View File

@@ -0,0 +1,16 @@
name: Virus Scan
permissions: read-all
on:
workflow_dispatch:
jobs:
virus:
name: McAfee Virus Scan
runs-on: self-hosted
steps:
- uses: actions/checkout@v3
- name: Execute Scan
uses: intel-innersource/frameworks.devops.github.actions.mcafee@main
with:
scan_path: neural_compressor

View File

@@ -0,0 +1,27 @@
name: Probot
permissions: read-all
on:
pull_request:
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
cancel-in-progress: true
jobs:
required-jobs:
runs-on: ubuntu-latest
if: github.event.pull_request.draft == false
timeout-minutes: 361 # in case something is wrong with the internal timeout
steps:
- uses: XuehaoSun/probot@0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
job: check-group
interval: 180 # seconds
timeout: 360 # minutes
maintainers: "[XuehaoSun](https://github.com/XuehaoSun)"
owner: "[chensuyue](https://github.com/chensuyue) or [XuehaoSun](https://github.com/XuehaoSun)"

View File

@@ -0,0 +1,28 @@
name: Publish
permissions: {}
on:
push:
branches:
- master
jobs:
build:
runs-on: ubuntu-latest
permissions:
pull-requests: write
contents: write
steps:
- uses: actions/checkout@v3
- name: Build Online Document
run: |
git config --local --get remote.origin.url
cd docs/build_docs
bash build.sh latest
- name: Push to GitHub
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./build_tmp/gh-pages
publish_branch: gh-pages

View File

@@ -0,0 +1,24 @@
*.pyc
.vscode
.idea
/venv/
*/__pycache__
.ipynb_checkpoints/
*.snapshot
*.csv
*.pb
*.ckpt
*.log
*.swp
*.onnx
*.so
*.egg-info/
.eggs/
dist/
tags
build/
_build
lpot_workspace/
.torch/
node_modules
build_tmp

View File

@@ -0,0 +1,161 @@
ci:
autofix_prs: true
autoupdate_schedule: quarterly
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
files: (.*\.(py|md|rst|yaml|yml))$
exclude: |
(?x)^(
examples/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/algorithms/fp8_quant/.+
)$
- id: check-json
exclude: |
(?x)^(
.vscode/settings_recommended.json
)$
- id: check-yaml
exclude: |
(?x)^(
conda_meta/|
neural_compressor/template/pruning.yaml|
neural_compressor/adaptor/tensorflow_itex.yaml|
neural_compressor/adaptor/tensorflow.yaml
)$
- id: debug-statements
- id: file-contents-sorter
exclude: |
(?x)^(
examples/.+
)$
args: [--unique]
- id: requirements-txt-fixer
exclude: |
(?x)^(
examples/.+
)$
- id: trailing-whitespace
files: (.*\.(py|rst|cmake|yaml|yml))$
exclude: |
(?x)^(
examples/.+|
neural_compressor/torch/utils/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/quantization/.+
)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.5
hooks:
- id: insert-license
files: |
(?x)^(
neural_compressor/.*(py|yaml|yml|sh)
)$
args:
[
--license-filepath=.github/license_template.txt,
--use-current-year,
--detect-license-in-X-top-lines=40,
--skip-license-insertion-comment=Copyright,
]
- repo: https://github.com/asottile/yesqa
rev: v1.5.0
hooks:
- id: yesqa
name: Unused noqa
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
exclude: |
(?x)^(
examples/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$
- repo: https://github.com/PyCQA/docformatter
rev: 06907d0
hooks:
- id: docformatter
args: [
--in-place,
--wrap-summaries=0, # 0 means disable wrap
--wrap-descriptions=0, # 0 means disable wrap
--black,
--style=google,
]
exclude: |
(?x)^(
examples/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$
- repo: https://github.com/psf/black.git
rev: 24.10.0
hooks:
- id: black
files: (.*\.py)$
exclude: |
(?x)^(
neural_compressor/conf/config.py|
neural_compressor/conf/pythonic_config.py|
examples/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$
- repo: https://github.com/asottile/blacken-docs
rev: 1.19.1
hooks:
- id: blacken-docs
args: [--line-length=120, --skip-errors]
additional_dependencies:
- black==24.10.0
exclude: |
(?x)^(
examples/.+|
docs/source-app|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
args: [-w]
additional_dependencies:
- tomli
exclude: |
(?x)^(
examples/.*(txt|patch)|
examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/prompt.json|
examples/notebook/dynas/ResNet50_Quantiation_Search_Supernet_NAS.ipynb|
examples/notebook/dynas/Transformer_LT_Supernet_NAS.ipynb|
neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/imagenet1000_clsidx_to_labels.txt|
neural_compressor/evaluation/hf_eval/datasets/cnn_validation.json|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.6
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]
exclude: |
(?x)^(
examples/.+|
neural_compressor/torch/algorithms/fp8_quant/.+|
test/3x/torch/.+
)$

View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
============================================================================
Copyright 2016-2019 Intel Corporation
Copyright 2018 YANDEX LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This distribution includes third party software ("third party programs").
This third party software, even if included with the distribution of
the Intel software, may be governed by separate license terms, including
without limitation, third party license terms, other Intel software license
terms, and open source software license terms. These separate license terms
govern your use of the third party programs as set forth in the
"THIRD-PARTY-PROGRAMS" file.

View File

@@ -0,0 +1,199 @@
<div align="center">
Intel® Neural Compressor
===========================
<h3> An open-source Python library supporting popular model compression techniques on all mainstream deep learning frameworks (TensorFlow, PyTorch, and ONNX Runtime)</h3>
[![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/intel/neural-compressor)
[![version](https://img.shields.io/badge/release-3.1.1-green)](https://github.com/intel/neural-compressor/releases)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/intel/neural-compressor/blob/master/LICENSE)
[![coverage](https://img.shields.io/badge/coverage-85%25-green)](https://github.com/intel/neural-compressor)
[![Downloads](https://static.pepy.tech/personalized-badge/neural-compressor?period=total&units=international_system&left_color=grey&right_color=green&left_text=downloads)](https://pepy.tech/project/neural-compressor)
[Architecture](./docs/source/3x/design.md#architecture)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Workflow](./docs/source/3x/design.md#workflows)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[LLMs Recipes](./docs/source/llm_recipes.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/validated_model_list.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentations](https://intel.github.io/neural-compressor)
---
<div align="left">
Intel® Neural Compressor aims to provide popular model compression techniques such as quantization, pruning (sparsity), distillation, and neural architecture search on mainstream frameworks such as [TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), and [ONNX Runtime](https://onnxruntime.ai/),
as well as Intel extensions such as [Intel Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow) and [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch).
In particular, the tool provides the following key features, typical examples, and open collaborations:
* Support a wide range of Intel hardware such as [Intel Gaudi AI Accelerators](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), [Intel Core Ultra Processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing;
support AMD CPU, ARM CPU, and NVIDIA GPU through ONNX Runtime with limited testing; support NVIDIA GPU for some WOQ algorithms like AutoRound and HQQ.
* Validate popular LLMs such as [LLama2](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies
* Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst)
## What's New
* [2024/10] [Transformers-like API](./docs/source/3x/transformers_like_api.md) for INT4 inference on Intel CPU and GPU.
* [2024/07] From 3.0 release, framework extension API is recommended to be used for quantization.
* [2024/07] Performance optimizations and usability improvements on [client-side](./docs/source/3x/client_quant.md).
## Installation
Choose the necessary framework dependencies to install based on your deployment environment.
### Install Framework
* [Install intel_extension_for_pytorch for CPU](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)
* [Install intel_extension_for_pytorch for XPU](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)
* [Use Docker Image with torch installed for HPU](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#bare-metal-fresh-os-single-click)
**Note**: There is a version mapping between Intel Neural Compressor and the Gaudi Software Stack; please refer to this [table](./docs/source/3x/gaudi_version_map.md) and make sure to use a matched combination.
* [Install torch for other platform](https://pytorch.org/get-started/locally)
* [Install TensorFlow](https://www.tensorflow.org/install)
### Install Neural Compressor from pypi
```
# Install 2.X API + Framework extension API + PyTorch dependency
pip install neural-compressor[pt]
# Install 2.X API + Framework extension API + TensorFlow dependency
pip install neural-compressor[tf]
```
**Note**: Further installation methods can be found under [Installation Guide](./docs/source/installation_guide.md). Check out our [FAQ](./docs/source/faq.md) for more details.
## Getting Started
After successfully installing these packages, try your first quantization program. **The following example code demonstrates FP8 quantization**, which is supported by the Intel Gaudi2 AI Accelerator.
To try it on Intel Gaudi2, a Docker image with the Gaudi Software Stack is recommended; please refer to the following script for environment setup. More details can be found in the [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).
Run a container with an interactive shell,
```
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
```
Run the example,
```python
from neural_compressor.torch.quantization import (
    FP8Config,
    prepare,
    convert,
)
import torch
import torchvision.models as models
model = models.resnet18()
qconfig = FP8Config(fp8_config="E4M3")
model = prepare(model, qconfig)
# Customer defined calibration. Below is a dummy calibration
model(torch.randn(1, 3, 224, 224).to("hpu"))
model = convert(model)
output = model(torch.randn(1, 3, 224, 224).to("hpu")).to("cpu")
print(output.shape)
```
More [FP8 quantization doc](./docs/source/3x/PT_FP8Quant.md).
**The following example code demonstrates loading a weight-only quantized large language model** on the Intel Gaudi2 AI Accelerator.
```python
import torch

from neural_compressor.torch.quantization import load

model_name = "TheBloke/Llama-2-7B-GPTQ"
model = load(
    model_name_or_path=model_name,
    format="huggingface",
    device="hpu",
    torch_dtype=torch.bfloat16,
)
```
**Note:** Intel Neural Compressor converts the model format from auto-gptq to the HPU format on the first load and saves `hpu_model.safetensors` to the local cache directory for subsequent loads, so the first load may take a while.
## Documentation
<table class="docutils">
<thead>
<tr>
<th colspan="8">Overview</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="2" align="center"><a href="./docs/source/3x/design.md#architecture">Architecture</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/design.md#workflows">Workflow</a></td>
<td colspan="2" align="center"><a href="https://intel.github.io/neural-compressor/latest/docs/source/api-doc/apis.html">APIs</a></td>
<td colspan="1" align="center"><a href="./docs/source/3x/llm_recipes.md">LLMs Recipes</a></td>
<td colspan="1" align="center"><a href="./examples/3.x_api/README.md">Examples</a></td>
</tr>
</tbody>
<thead>
<tr>
<th colspan="8">PyTorch Extension APIs</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="2" align="center"><a href="./docs/source/3x/PyTorch.md">Overview</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_DynamicQuant.md">Dynamic Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_StaticQuant.md">Static Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_SmoothQuant.md">Smooth Quantization</a></td>
</tr>
<tr>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_WeightOnlyQuant.md">Weight-Only Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_FP8Quant.md">FP8 Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_MXQuant.md">MX Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/PT_MixedPrecision.md">Mixed Precision</a></td>
</tr>
</tbody>
<thead>
<tr>
<th colspan="8">Tensorflow Extension APIs</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="3" align="center"><a href="./docs/source/3x/TensorFlow.md">Overview</a></td>
<td colspan="3" align="center"><a href="./docs/source/3x/TF_Quant.md">Static Quantization</a></td>
<td colspan="2" align="center"><a href="./docs/source/3x/TF_SQ.md">Smooth Quantization</a></td>
</tr>
</tbody>
<thead>
<tr>
<th colspan="8">Transformers-like APIs</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="8" align="center"><a href="./docs/source/3x/transformers_like_api.md">Overview</a></td>
</tr>
</tbody>
<thead>
<tr>
<th colspan="8">Other Modules</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="4" align="center"><a href="./docs/source/3x/autotune.md">Auto Tune</a></td>
<td colspan="4" align="center"><a href="./docs/source/3x/benchmark.md">Benchmark</a></td>
</tr>
</tbody>
</table>
> **Note**:
> Starting from the 3.0 release, we recommend using the 3.X API. Compression techniques applied during training, such as QAT, pruning, and distillation, are currently only available in the [2.X API](https://github.com/intel/neural-compressor/blob/master/docs/source/2x_user_guide.md).
## Selected Publications/Events
* EMNLP'2024: [Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs](https://arxiv.org/abs/2309.05516) (Sep 2024)
* Blog on Medium: [Quantization on Intel Gaudi Series AI Accelerators](https://medium.com/intel-analytics-software/intel-neural-compressor-v3-0-a-quantization-tool-across-intel-hardware-9856adee6f11) (Aug 2024)
* Blog by Intel: [Neural Compressor: Boosting AI Model Efficiency](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Neural-Compressor-Boosting-AI-Model-Efficiency/post/1604740) (June 2024)
* Blog by Intel: [Optimization of Intel AI Solutions for Alibaba Cloud's Qwen2 Large Language Models](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-ai-solutions-accelerate-alibaba-qwen2-llms.html) (June 2024)
* Blog by Intel: [Accelerate Meta* Llama 3 with Intel AI Solutions](https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-meta-llama3-with-intel-ai-solutions.html) (Apr 2024)
* EMNLP'2023 (Under Review): [TEQ: Trainable Equivalent Transformation for Quantization of LLMs](https://openreview.net/forum?id=iaI8xEINAf&referrer=%5BAuthor%20Console%5D) (Sep 2023)
* arXiv: [Efficient Post-training Quantization with FP8 Formats](https://arxiv.org/abs/2309.14592) (Sep 2023)
* arXiv: [Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs](https://arxiv.org/abs/2309.05516) (Sep 2023)
> **Note**:
> View [Full Publication List](https://github.com/intel/neural-compressor/blob/master/docs/source/publication_list.md).
## Additional Content
* [Release Information](./docs/source/releases_info.md)
* [Contribution Guidelines](./docs/source/CONTRIBUTING.md)
* [Legal Information](./docs/source/legal_information.md)
* [Security Policy](SECURITY.md)
## Communication
- [GitHub Issues](https://github.com/intel/neural-compressor/issues): mainly for bug reports, new feature requests, questions, etc.
- [Email](mailto:inc.maintainers@intel.com): welcome to raise interesting research ideas on model compression techniques by email for collaboration.
- [Discord Channel](https://discord.com/invite/Wxk3J3ZJkU): join the Discord channel for more flexible technical discussion.
- [WeChat group](/docs/source/imgs/wechat_group.jpg): scan the QR code to join the technical discussion.

View File

@@ -0,0 +1,13 @@
Security Policy
===============
## Report a Vulnerability
Please report security issues or vulnerabilities to the [Intel® Security Center].
For more information on how Intel® works to resolve security issues, see
[Vulnerability Handling Guidelines].
[Intel® Security Center]:https://www.intel.com/security
[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html

View File

@@ -0,0 +1,47 @@
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG UBUNTU_VER=22.04
FROM ubuntu:${UBUNTU_VER} as deploy
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
ARG PYTHON=python3.8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
${PYTHON}-dev \
gcc \
libgl1-mesa-glx \
libglib2.0-0 \
python3 \
python3-pip \
curl
RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
pip \
setuptools
RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
ln -sf $(which ${PYTHON}) /usr/local/bin/python3 && \
ln -sf $(which ${PYTHON}) /usr/bin/python && \
ln -sf $(which ${PYTHON}) /usr/bin/python3
ARG INC_VER=2.3
RUN mkdir -p /licenses && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/third-party-programs.txt -o /licenses/third-party-programs.txt && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/docker/third-party-programs-docker.txt -o /licenses/third-party-programs-docker.txt && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/LICENSE -o /licenses/LICENSE
RUN python -m pip install --no-cache-dir neural-compressor${INC_VER:+==${INC_VER}}

View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG UBUNTU_VER=20.04
FROM ubuntu:${UBUNTU_VER} as devel
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
ARG PYTHON=python3.8
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
python3 \
python3-pip
RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
pip \
setuptools
RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
ln -sf $(which ${PYTHON}) /usr/local/bin/python3 && \
ln -sf $(which ${PYTHON}) /usr/bin/python && \
ln -sf $(which ${PYTHON}) /usr/bin/python3
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
${PYTHON}-dev \
${PYTHON}-distutils \
autoconf \
build-essential \
cmake \
g++ \
git \
libgl1-mesa-glx \
libglib2.0-0 \
curl
ARG INC_BRANCH=v2.4rc1
RUN git clone --single-branch --branch=${INC_BRANCH} https://github.com/intel/neural-compressor.git && \
cd neural-compressor && \
git submodule sync && \
git submodule update --init --recursive && \
python -m pip install --no-cache-dir pycocotools && \
python -m pip install --no-cache-dir -r requirements.txt && \
python setup.py install
WORKDIR /neural-compressor
ARG INC_VER=2.3
RUN mkdir -p /licenses && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/third-party-programs.txt -o /licenses/third-party-programs.txt && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/docker/third-party-programs-docker.txt -o /licenses/third-party-programs-docker.txt && \
curl https://raw.githubusercontent.com/intel/neural-compressor/v${INC_VER}/LICENSE -o /licenses/LICENSE

View File

@@ -0,0 +1,34 @@
## Build Intel Neural Compressor Containers:
### To build the `Pip` based deployment container:
Please note that `INC_VER` must be set to a valid version published here:
https://pypi.org/project/neural-compressor/#history
```console
$ PYTHON=python3.10
$ INC_VER=3.2
$ IMAGE_NAME=neural-compressor
$ IMAGE_TAG=${INC_VER}
$ docker build --build-arg PYTHON=${PYTHON} --build-arg INC_VER=${INC_VER} -f Dockerfile -t ${IMAGE_NAME}:${IMAGE_TAG} .
```
### To build the `Pip` based development container:
Please note that `INC_BRANCH` must be set to a valid branch name; otherwise, the Docker build fails.
If `${INC_BRANCH}-devel` does not meet Docker tagging requirements described here:
https://docs.docker.com/engine/reference/commandline/tag/
then please modify the tag so that the tagging requirement is met, for example by replacing `/` with `-`.
```console
$ PYTHON=python3.10
$ INC_BRANCH=3.2
$ IMAGE_NAME=neural-compressor
$ IMAGE_TAG=${INC_BRANCH}-devel
$ docker build --build-arg PYTHON=${PYTHON} --build-arg INC_BRANCH=${INC_BRANCH} -f Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} .
```
### Check the Containers built:
```console
$ docker images | grep -i neural-compressor
neural-compressor v3.2-devel 5c0dc1371312 5 minutes ago 2.76GB
neural-compressor 3.2 303de7f7c38d 36 minutes ago 1.61GB
```

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View File

@@ -0,0 +1,185 @@
#!/bin/bash
help () {
echo ""
echo "Help:"
echo "$0 or $0 local"
echo " Build html for local test, not merge to gh-pages branch"
echo "$0 version"
echo " Build for version (version.py), then merge & push to gh-pages branch"
echo "$0 latest"
echo " Build for latest code, then merge & push to gh-pages branch"
}
if [ ! -n "$1" ]; then
ACT=only_build_local
else
if [ "$1" == "version" ]; then
ACT=build_version
elif [ "$1" == "latest" ]; then
ACT=build_latest
elif [ "$1" == "local" ]; then
ACT=only_build_local
elif [ "$1" == "help" ]; then
help
exit 0
else
echo "Wrong parameter \"$1\""
help
exit 1
fi
fi
echo "ACT is ${ACT}"
if [ ${ACT} == "only_build_local" ]; then
UPDATE_LATEST_FOLDER=1
UPDATE_VERSION_FOLDER=1
CHECKOUT_GH_PAGES=0
elif [ ${ACT} == "build_version" ]; then
UPDATE_LATEST_FOLDER=0
UPDATE_VERSION_FOLDER=1
CHECKOUT_GH_PAGES=1
elif [ ${ACT} == "build_latest" ]; then
UPDATE_LATEST_FOLDER=1
UPDATE_VERSION_FOLDER=0
CHECKOUT_GH_PAGES=1
fi
WORK_DIR=../../build_tmp
rm -rf /tmp/env_sphinx
if [ ! -d ${WORK_DIR} ]; then
echo "no ${WORK_DIR}"
else
if [ ! -d ${WORK_DIR}/env_sphinx ]; then
echo "no exist ${WORK_DIR}/env_sphinx"
else
cp -rf ${WORK_DIR}/env_sphinx /tmp/
rm -rf ${WORK_DIR}
echo "backup ${WORK_DIR}/env_sphinx to /tmp"
fi
fi
mkdir -p ${WORK_DIR}
cp -rf ./* ${WORK_DIR}
cd ${WORK_DIR}
if [ ! -d /tmp/env_sphinx ]; then
echo "no /tmp/env_sphinx"
else
echo "restore env_sphinx from /tmp"
cp -r /tmp/env_sphinx ./
fi
if [ ! -d env_sphinx ]; then
echo "create env_sphinx"
bash pip_set_env.sh
fi
source env_sphinx/bin/activate
cp -rf ../docs/ ./source
cp -f "../README.md" "./source/docs/source/Welcome.md"
cp -f "../SECURITY.md" "./source/docs/source/SECURITY.md"
all_md_files=`find ./source/docs -name "*.md"`
for md_file in ${all_md_files}
do
sed -i 's/.md/.html/g' ${md_file}
done
sed -i 's/.\/docs\/source\/_static/./g' ./source/docs/source/Welcome.md
sed -i 's/.md/.html/g; s/.\/docs\/source\//.\//g' ./source/docs/source/Welcome.md
#sed -i 's/\/examples\/README.html/https:\/\/github.com\/intel\/neural-compressor\/blob\/master\/examples\/README.md/g' ./source/docs/source/user_guide.md
sed -i 's/https\:\/\/intel.github.io\/neural-compressor\/lates.\/api-doc\/apis.html/https\:\/\/intel.github.io\/neural-compressor\/latest\/docs\/source\/api-doc\/apis.html/g' ./source/docs/source/Welcome.md
sed -i 's/\/examples\/pytorch/https:\/\/github.com\/intel\/neural-compressor\/blob\/master\/examples\/pytorch/g' ./source/docs/source/Welcome.md
sed -i 's/examples\/README.html/https:\/\/github.com\/intel\/neural-compressor\/blob\/master\/examples\/README.md/g' ./source/docs/source/Welcome.md
sed -i 's/\/examples\/README.md/https:\/\/github.com\/intel\/neural-compressor\/blob\/master\/examples\/README.md/g' ./source/docs/source/get_started.md
sed -i 's/.\/validated_model_list.md\#/.\/validated_model_list.html\#/g' ./source/docs/source/installation_guide.md
make clean
make html
if [[ $? -eq 0 ]]; then
echo "Sphinx build online documents successfully!"
else
echo "Sphinx build online documents fault!"
exit 1
fi
DRAFT_FOLDER=./draft
mkdir -p ${DRAFT_FOLDER}
VERSION=`cat source/version.txt`
DST_FOLDER=${DRAFT_FOLDER}/${VERSION}
LATEST_FOLDER=${DRAFT_FOLDER}/latest
SRC_FOLDER=build/html
RELEASE_FOLDER=./gh-pages
ROOT_DST_FOLDER=${RELEASE_FOLDER}/${VERSION}
ROOT_LATEST_FOLDER=${RELEASE_FOLDER}/latest
if [[ ${UPDATE_VERSION_FOLDER} -eq 1 ]]; then
echo "create ${DST_FOLDER}"
rm -rf ${DST_FOLDER}/*
mkdir -p ${DST_FOLDER}
cp -r ${SRC_FOLDER}/* ${DST_FOLDER}
python update_html.py ${DST_FOLDER} ${VERSION}
cp -r ./source/docs/source/imgs ${DST_FOLDER}/docs/source
cp -r ./source/docs/source/3x/imgs ${DST_FOLDER}/docs/source/3x
cp source/_static/index.html ${DST_FOLDER}
else
echo "skip to create ${DST_FOLDER}"
fi
if [[ ${UPDATE_LATEST_FOLDER} -eq 1 ]]; then
echo "create ${LATEST_FOLDER}"
rm -rf ${LATEST_FOLDER}/*
mkdir -p ${LATEST_FOLDER}
cp -r ${SRC_FOLDER}/* ${LATEST_FOLDER}
python update_html.py ${LATEST_FOLDER} ${VERSION}
cp -r ./source/docs/source/imgs ${LATEST_FOLDER}/docs/source
cp -r ./source/docs/source/3x/imgs ${LATEST_FOLDER}/docs/source/3x
cp source/_static/index.html ${LATEST_FOLDER}
else
echo "skip to create ${LATEST_FOLDER}"
fi
echo "Create document is done"
if [[ ${CHECKOUT_GH_PAGES} -eq 1 ]]; then
git clone -b gh-pages --single-branch https://github.com/intel/neural-compressor.git ${RELEASE_FOLDER}
if [[ ${UPDATE_VERSION_FOLDER} -eq 1 ]]; then
python update_version.py ${ROOT_DST_FOLDER} ${VERSION}
cp -rf ${DST_FOLDER} ${RELEASE_FOLDER}
fi
if [[ ${UPDATE_LATEST_FOLDER} -eq 1 ]]; then
cp -rf ${LATEST_FOLDER} ${RELEASE_FOLDER}
fi
else
echo "skip pull gh-pages"
fi
echo "UPDATE_LATEST_FOLDER=${UPDATE_LATEST_FOLDER}"
echo "UPDATE_VERSION_FOLDER=${UPDATE_VERSION_FOLDER}"
if [[ $? -eq 0 ]]; then
echo "create online documents successfully!"
else
echo "create online documents fault!"
exit 1
fi
exit 0

View File

@@ -0,0 +1,37 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
REM set SPHINXBUILD=sphinx-multiversion
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

View File

@@ -0,0 +1,10 @@
#!/bin/bash
ENV_NAME=env_sphinx
deactivate
rm -rf $ENV_NAME
python -m venv $ENV_NAME
source $ENV_NAME/bin/activate
pip install --upgrade pip
pip install -r sphinx-requirements.txt

View File

@@ -0,0 +1,20 @@
/* widen the content area to 1200px */
.wy-nav-content {
max-width: 1200px;
}
/* code block highlight color in rtd changed to lime green, no no no */
.rst-content tt.literal, .rst-content code.literal, .highlight {
background: #f0f0f0;
}
.rst-content tt.literal, .rst-content code.literal {
color: #000000;
}
div.version a:link {
color: #ffffff;
}
div.version a:visited {
color: #dddddd;
}

View File

@@ -0,0 +1 @@
<meta http-equiv="refresh" content="0; URL='./docs/source/Welcome.html'" />

View File

@@ -0,0 +1,3 @@
{% extends '!footer.html' %} {% block extrafooter %} {{ super() }}
<p></p><div><a href='https://www.intel.com/content/www/us/en/privacy/intel-cookie-notice.html' data-cookie-notice='true'>Cookies</a> <a href='https://www.intel.com/content/www/us/en/privacy/intel-privacy-notice.html'>| Privacy</a></div>
{% endblock %}

View File

@@ -0,0 +1,16 @@
{%- extends "!layout.html" %}
{% block scripts %}
<script type="text/javascript">
// Configure TMS settings
window.wapProfile = 'profile-microsite'; // This is mapped by WAP authorize value
window.wapLocalCode = 'us-en'; // Dynamically set per localized site, see mapping table for values
window.wapSection = "neural-compressor"; // WAP team will give you a unique section for your site
window.wapEnv = 'prod'; // environment to be use in Adobe Tags.
// Load TMS
(() => {
let url = 'https://www.intel.com/content/dam/www/global/wap/main/wap-microsite.js';
let po = document.createElement('script'); po.type = 'text/javascript'; po.async = true; po.src = url;
let s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s);
}) ();
</script>
{% endblock %}

View File

@@ -0,0 +1,91 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import os
import sys
sys.path.insert(0, os.path.abspath("../../neural_compressor/"))
import version as ver
version = ver.__version__
release = version
with open("version.txt", "w") as f:
    f.write(version)
repo_url = "https://github.com/intel/neural-compressor/blob/v{}".format(version)
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "Intel® Neural Compressor"
copyright = "2022, Intel® Neural Compressor, Intel"
author = "Intel® Neural Compressor developers"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"recommonmark",
"sphinx_markdown_tables",
"sphinx.ext.coverage",
"sphinx.ext.autosummary",
"sphinx_md",
"sphinx_rtd_theme",
"autoapi.extension",
"sphinx.ext.napoleon",
"sphinx.ext.githubpages",
"sphinx.ext.linkcode",
"sphinxcontrib.jquery",
]
autoapi_dirs = ["../../neural_compressor"]
autoapi_root = "autoapi"
autoapi_keep_files = True
autoapi_add_toctree_entry = False
autosummary_generate = True
autoapi_options = ["members", "show-module-summary"]
autoapi_ignore = []
templates_path = ["_templates"]
source_suffix = [".rst", ".md"]
# The master toctree document.
master_doc = "index"
exclude_patterns = []
pygments_style = "sphinx"
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
# html_theme = 'alabaster'
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
templates_path = ["_templates"]
def skip_util_classes(app, what, name, obj, skip, options):
    if what == "property" or what == "method":
        skip = True
    return skip


def setup(app):
    app.add_css_file("custom.css")
    app.connect("autoapi-skip-member", skip_util_classes)


def linkcode_resolve(domain, info):
    if domain != "py":
        return None
    if not info["module"]:
        return None
    filename = info["module"].replace(".", "/")
    return "{}/{}.py".format(repo_url, filename)

View File

@@ -0,0 +1,22 @@
Intel® Neural Compressor Documentation
######################################
Welcome to the project.
Sections
********
.. toctree::
   :maxdepth: 1

   docs/source/get_started.md
   docs/source/installation_guide.md
   docs/source/user_guide.md
   docs/source/examples_readme.md
   docs/source/api-doc/apis.rst
   docs/source/releases_info.md
   docs/source/contributions.md
   docs/source/legal_information.md
   docs/source/SECURITY.md
   Repo <https://github.com/intel/neural-compressor>

View File

@@ -0,0 +1,10 @@
recommonmark==0.7.1
setuptools_scm[toml]==8.1.0
sphinx==7.3.7
sphinx-autoapi==3.1.0
sphinx-autobuild==2024.4.16
sphinx-markdown-tables==0.0.17
sphinx-md==0.0.4
sphinx_rtd_theme==2.0.0
sphinxcontrib-jquery==4.1
sphinxemoji==0.3.1

View File

@@ -0,0 +1,100 @@
import glob
import os
import sys
def find_index_path(index_file):
    with open(index_file, "r") as f:
        lines = f.readlines()
    for line in lines:
        pos = line.find('index.html" class="icon icon-home"')
        if pos < 0:
            continue
        pos1 = line.rfind('"', 0, pos)
        if pos1 < 0:
            return ""
        else:
            return "../" + line[pos1 + 1 : pos]
    return "ignore"


def update_version_link(version, folder_name, index_file):
    index_buf = ""
    index_path = find_index_path(index_file)
    if index_path == "ignore":
        return

    with open(index_file, "r") as f:
        index_buf = f.read()

    key_str = ' <div class="version">\n {}\n </div>'.format(version)
    version_list = """<div class="version">
<a href="{}versions.html">{}▼</a>
<p>Click link above to switch version</p>
</div>""".format(
        index_path, folder_name
    )
    # print(index_buf.find(key_str))
    index_buf = index_buf.replace(key_str, version_list)
    # print(index_buf)
    with open(index_file, "w") as f:
        f.write(index_buf)


def update_source_url(version, folder_name, index_file):
    if "latest" != folder_name:
        return

    base_url = 'class="reference external" href="https://github.com/intel/neural-compressor/blob/{}/'
    repo_url = base_url.format("v" + version)
    target = base_url.format("master")
    with open(index_file, "r") as f:
        index_buf = f.read()
    index_buf = index_buf.replace(repo_url, target)
    with open(index_file, "w") as f:
        f.write(index_buf)


def update_search(folder):
    search_file_name = "{}/search.html".format(folder)
    with open(search_file_name, "r") as f:
        index_buf = f.read()

    key_str = '<script src="_static/searchtools.js"></script>'
    version_list = """<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=fc837d61"></script>
<script src="_static/doctools.js?v=9a2dae69"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<script src="_static/searchtools.js"></script>"""
    index_buf = index_buf.replace(key_str, version_list)
    with open(search_file_name, "w") as f:
        f.write(index_buf)


def main(folder, version):
    folder_name = os.path.basename(folder)
    for index_file in glob.glob("{}/**/*.html".format(folder), recursive=True):
        update_version_link(version, folder_name, index_file)
        update_source_url(version, folder_name, index_file)
    update_search(folder)


def help(me):
    print("python {} html_folder version".format(me))


if __name__ == "__main__":
    if len(sys.argv) < 3:
        help(sys.argv[0])
        sys.exit(1)
    folder = sys.argv[1]
    version = sys.argv[2]
    main(folder, version)

View File

@@ -0,0 +1,37 @@
import os
import sys
def main(folder, version):
    folder_name = os.path.basename(folder)
    version_file = "{}/versions.html".format(os.path.dirname(folder))
    # print(version_file)
    ver_buf = ""
    with open(version_file, "r") as f:
        ver_buf = f.read()

    if ver_buf.find(version) >= 0:
        return

    key_str = '<li><a href="latest">latest</a></li>'
    new_ver = """<li><a href="latest">latest</a></li>
<li><a href="{}">{}</a></li>""".format(
        version, version
    )
    ver_buf = ver_buf.replace(key_str, new_ver)
    with open(version_file, "w") as f:
        f.write(ver_buf)


def help(me):
    print("python {} html_folder version".format(me))


if __name__ == "__main__":
    if len(sys.argv) < 3:
        help(sys.argv[0])
        sys.exit(1)
    folder = sys.argv[1]
    version = sys.argv[2]
    main(folder, version)

View File

@@ -0,0 +1,76 @@
2.X API User Guide
===========================
Intel® Neural Compressor aims to provide popular model compression techniques such as quantization, pruning (sparsity), distillation, and neural architecture search to help users optimize their models. The documents below help you get familiar with the concepts and modules in Intel® Neural Compressor and learn how to utilize its APIs to conduct quantization, pruning (sparsity), distillation, and neural architecture search on mainstream frameworks.
## Overview
This part helps users quickly understand the design structure and workflow of the 2.X Intel® Neural Compressor. We provide broad examples to help users get started.
<table class="docutils">
<tbody>
<tr>
<td colspan="4" align="center"><a href="design.md#architecture">Architecture</a></td>
<td colspan="3" align="center"><a href="design.md#workflow">Workflow</a></td>
<td colspan="2" align="center"><a href="https://intel.github.io/neural-compressor/latest/docs/source/api-doc/apis.html">APIs</a></td>
</tr>
<tr>
<td colspan="2" align="center"><a href="/examples/README.md#notebook-examples">Notebook</a></td>
<td colspan="1" align="center"><a href="/examples/README.md">Examples</a></td>
<td colspan="1" align="center"><a href="validated_model_list.md">Results</a></td>
<td colspan="5" align="center"><a href="https://software.intel.com/content/www/us/en/develop/documentation/get-started-with-ai-linux/top.html">Intel oneAPI AI Analytics Toolkit</a></td>
</tr>
</tbody>
</table>
## Python-based APIs
The Python-based APIs cover the functional APIs in Intel® Neural Compressor in more detail;
they introduce the mechanism of each function and provide tutorials to help users apply them to their own cases.
Please note that support for the Intel Neural Compressor 1.X API will be discontinued,
so we provide a comprehensive migration document in Code Migration to help users update their code from the previous 1.X version to the new 2.X version.
In the 2.X API, it is important to create the `DataLoader` and `Metrics` for your examples, so we provide detailed introductions; a minimal usage sketch follows the table below.
<table class="docutils">
<tbody>
<tr>
<td colspan="2" align="center"><a href="quantization.md">Quantization</a></td>
<td colspan="3" align="center"><a href="mixed_precision.md">Advanced Mixed Precision</a></td>
<td colspan="2" align="center"><a href="pruning.md">Pruning (Sparsity)</a></td>
<td colspan="2" align="center"><a href="distillation.md">Distillation</a></td>
</tr>
<tr>
<td colspan="2" align="center"><a href="orchestration.md">Orchestration</a></td>
<td colspan="2" align="center"><a href="benchmark.md">Benchmarking</a></td>
<td colspan="3" align="center"><a href="distributed.md">Distributed Compression</a></td>
<td colspan="3" align="center"><a href="export.md">Model Export</a></td>
</tr>
<tr>
<td colspan="9" align="center"><a href="migration.md">Code Migration from Intel® Neural Compressor 1.X to Intel® Neural Compressor 2.X</a></td>
</tr>
<tr>
<td colspan="4" align="center"><a href="dataloader.md">DataLoader</a></td>
<td colspan="5" align="center"><a href="metric.md">Metric</a></td>
</tr>
</tbody>
</table>
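To make the table above concrete, here is a minimal sketch of accuracy-driven post-training quantization with the 2.X `fit` API, using the built-in dummy dataset as the calibration `DataLoader`. The ResNet-18 model is only a placeholder, and the exact argument names should be checked against the Quantization and DataLoader documents linked above.

```python
import torchvision.models as models

from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.data import DataLoader, Datasets
from neural_compressor.quantization import fit

# Placeholder FP32 model and a dummy calibration dataloader (2.X built-ins).
model = models.resnet18()
dataset = Datasets("pytorch")["dummy"](shape=(1, 3, 224, 224))
calib_dataloader = DataLoader(framework="pytorch", dataset=dataset)

# Accuracy-driven post-training static quantization with the 2.X API.
q_model = fit(
    model=model,
    conf=PostTrainingQuantConfig(),
    calib_dataloader=calib_dataloader,
)
q_model.save("./saved_results")
```

In a real workflow, the dummy dataset is replaced by your own calibration data and an evaluation function or `Metric`, which is what drives the accuracy-aware tuning loop.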
## Advanced Topics
This part provides advanced topics that help users dive deep into the Intel® Neural Compressor 2.X API.
<table class="docutils">
<tbody>
<tr>
<td colspan="3" align="center"><a href="adaptor.md">Adaptor</a></td>
<td colspan="3" align="center"><a href="tuning_strategies.md">Strategy</a></td>
<td colspan="3" align="center"><a href="objective.md">Objective</a></td>
<td colspan="3" align="center"><a href="calibration.md">Calibration</a></td>
</tr>
<tr>
<td colspan="6" align="center"><a href="add_new_data_type.md">Add New Data Type</a></td>
<td colspan="6" align="center"><a href="add_new_adaptor.md">Add New Adaptor</a></td>
</tr>
<tr>
<td colspan="3" align="center"><a href="distillation_quantization.md">Distillation for Quantization</a></td>
<td colspan="3" align="center"><a href="smooth_quant.md">SmoothQuant</a></td>
<td colspan="3" align="center"><a href="quantization_weight_only.md">Weight-Only Quantization</a></td>
<td colspan="3" align="center"><a href="quantization_layer_wise.md">Layer-Wise Quantization</a></td>
</tr>
</tbody>
</table>

View File

@@ -0,0 +1,42 @@
Dynamic Quantization
===============
1. [Introduction](#introduction)
2. [Getting Started with Dynamic Quantization](#Getting-Started-with-Dynamic-Quantization)
3. [Examples](#examples)
## Introduction
Quantization is the process of converting floating point weights and activations to lower bitwidth tensors by multiplying the floating point values by a scale factor and rounding the results to whole numbers. Dynamic quantization determines the scale factor for activations dynamically based on the data range observed at runtime. We support W8A8 (quantizing weights and activations into 8 bits) dynamic quantization by leveraging torch's [`X86InductorQuantizer`](https://pytorch.org/tutorials/prototype/pt2e_quant_x86_inductor.html?highlight=x86inductorquantizer).
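To make the scale-factor description concrete, here is a small illustrative sketch of per-tensor symmetric INT8 quantization where the scale is computed dynamically from the observed data range. It is a simplified illustration only, not the kernel used by `X86InductorQuantizer`.

```python
import torch


def dynamic_int8_quant(x: torch.Tensor):
    # Dynamic quantization: the scale is derived from the data range observed at runtime.
    scale = x.abs().max().clamp(min=1e-12) / 127.0
    # Dividing by the scale is equivalent to multiplying by 1/scale.
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale


x = torch.randn(4, 8)
q, scale = dynamic_int8_quant(x)
x_dequant = q.to(torch.float32) * scale  # approximate reconstruction
print("max abs error:", (x - x_dequant).abs().max().item())
```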
## Getting Started with Dynamic Quantization
There are four steps to perform W8A8 dynamic quantization: `export`, `prepare`, `convert` and `compile`.
```python
import torch
from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import DynamicQuantConfig, prepare, convert
# Prepare the float model and example inputs for export model
model = UserFloatModel()
example_inputs = ...
# Export eager model into FX graph model
exported_model = export(model=model, example_inputs=example_inputs)
# Quantize the model
quant_config = DynamicQuantConfig()
prepared_model = prepare(exported_model, quant_config=quant_config)
q_model = convert(prepared_model)
# Compile the quantized model and replace the Q/DQ pattern with Q-operator
from torch._inductor import config
config.freezing = True
opt_model = torch.compile(q_model)
```
> Note: The `set_local` of `DynamicQuantConfig` will be supported after the torch 2.4 release.
## Examples
Examples will be added later.

View File

@@ -0,0 +1,267 @@
FP8 Quantization
=======
1. [Introduction](#introduction)
2. [Supported Parameters](#supported-parameters)
3. [Get Started with FP8 Quantization](#get-started-with-fp8-quantization)
4. [Optimum-habana LLM example](#optimum-habana-llm-example)
5. [VLLM example](#vllm-example)
## Introduction
Floating point 8 (FP8) is a promising data type for low-precision quantization. It provides a data distribution that is completely different from INT8, as shown below.
<div align="center">
<img src="./imgs/fp8_dtype.png" height="250"/>
</div>
Intel Gaudi2, also known as HPU, provides this data type capability for low precision quantization, which includes `E4M3` and `E5M2`. For more information about these two data type, please refer to [link](https://arxiv.org/abs/2209.05433).
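For a quick sense of the numeric range each format offers, the snippet below inspects both FP8 dtypes via `torch.finfo`; it assumes a PyTorch build that exposes the `float8_e4m3fn` and `float8_e5m2` dtypes (PyTorch 2.1 or newer).

```python
import torch

# E4M3 trades dynamic range for precision; E5M2 trades precision for range.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}, eps={info.eps}")
```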
Intel Neural Compressor provides general quantization APIs to leverage the HPU FP8 capability, producing 8-bit models with lower memory usage and lower compute cost through a simple API.
## Supported Parameters
<table class="tg"><thead>
<tr>
<th class="tg-fymr">Attribute</th>
<th class="tg-fymr">Description</th>
<th class="tg-fymr">Values</th>
</tr></thead>
<tbody>
<tr>
<td class="tg-0pky">fp8_config</td>
<td class="tg-0pky">The target data type of FP8 quantization.</td>
<td class="tg-0pky">E4M3 (default) - As Fig. 2<br>E5M2 - As Fig. 1.</td>
</tr>
<tr>
<td class="tg-0pky">hp_dtype</td>
<td class="tg-0pky">The high precision data type of non-FP8 operators.</td>
<td class="tg-0pky">bf16 (default) - torch.bfloat16<br>fp16 - torch.float16.<br>fp32 - torch.float32.</td>
</tr>
<tr>
<td class="tg-0pky">observer</td>
<td class="tg-0pky">The observer to measure the statistics.</td>
<td class="tg-0pky">maxabs (default), saves all tensors to files.</td>
</tr>
<tr>
<td class="tg-0pky">allowlist</td>
<td class="tg-0pky">List of nn.Module names or types to quantize. When setting an empty list, all the supported modules will be quantized by default. See Supported Modules. Not setting the list at all is not recommended as it will set the allowlist to these modules only: torch.nn.Linear, torch.nn.Conv2d, and BMM.</td>
<td class="tg-0pky">Default = {'names': [], 'types': <span title=["Matmul","Linear","FalconLinear","KVCache","Conv2d","LoRACompatibleLinear","LoRACompatibleConv","Softmax","ModuleFusedSDPA","LinearLayer","LinearAllreduce","ScopedLinearAllReduce","LmHeadLinearAllreduce"]>FP8_WHITE_LIST}</span></td>
</tr>
<tr>
<td class="tg-0pky">blocklist</td>
<td class="tg-0pky">List of nn.Module names or types not to quantize. Defaults to empty list, so you may omit it from the config file.</td>
<td class="tg-0pky">Default = {'names': [], 'types': ()}</td>
</tr>
<tr>
<td class="tg-0pky">mode</td>
<td class="tg-0pky">The mode, measure or quantize, to run HQT with.</td>
<td class="tg-0pky">MEASURE - Measure statistics of all modules and emit the results to dump_stats_path.<br>QUANTIZE - Quantize and run the model according to the provided measurements.<br>AUTO (default) - Select from [MEASURE, QUANTIZE] automatically.</td>
</tr>
<tr>
<td class="tg-0pky">dump_stats_path</td>
<td class="tg-0pky">The path to save and load the measurements. The path is created up until the level before last "/". The string after the last / will be used as prefix to all the measurement files that will be created.</td>
<td class="tg-0pky">Default = "./hqt_output/measure"</td>
</tr>
<tr>
<td class="tg-0pky">scale_method</td>
<td class="tg-0pky">The method for calculating the scale from the measurement.</td>
<td class="tg-0pky">- unit_scale - Always use scale of 1.<br>- hw_aligned_single_scale - Always use scale that's aligned to the corresponding HW accelerated scale.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
</tr>
<tr>
<td class="tg-0pky">measure_exclude</td>
<td class="tg-0pky">If this attribute is not defined, the default is OUTPUT. Since most models do not require measuring output tensors, you can exclude it to speed up the measurement process.</td>
<td class="tg-0pky">NONE - All tensors are measured.<br>OUTPUT (default) - Excludes measurement of output tensors.</td>
</tr>
</tbody></table>
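As a usage reference for the attributes above, here is a minimal sketch that builds an `FP8Config` with a few of them and runs the prepare/convert flow. The placeholder model and the exact keyword names are assumptions; double-check them against the FP8 example code for your installed Intel Neural Compressor version, and note that the run requires an Intel Gaudi (HPU) device.

```python
import torch

from neural_compressor.torch.quantization import FP8Config, convert, prepare

# Placeholder model; a real run executes on an Intel Gaudi (HPU) device.
model = torch.nn.Linear(16, 16)

config = FP8Config(
    fp8_config="E4M3",                       # target FP8 format
    hp_dtype="bf16",                         # high-precision dtype for non-FP8 ops
    scale_method="maxabs_hw",                # scale calculation method
    dump_stats_path="./hqt_output/measure",  # where measurements are saved/loaded
    measure_exclude="OUTPUT",                # skip measuring output tensors
)

model = prepare(model, config)
model(torch.randn(2, 16).to("hpu"))          # calibration pass with your own data
model = convert(model)
```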
## Get Started with FP8 Quantization
[Demo Usage](https://github.com/intel/neural-compressor?tab=readme-ov-file#getting-started)
[Computer vision example](../../../examples/3.x_api/pytorch/cv/fp8_quant)
## Optimum-habana LLM example
### Overview
[Optimum](https://huggingface.co/docs/optimum) is an extension of Transformers that provides a set of performance optimization tools to train and run models on targeted hardware with maximum efficiency.
[Optimum-habana](https://github.com/huggingface/optimum-habana) is the interface between the Transformers and Diffusers libraries and Intel Gaudi AI Accelerators (HPU). It provides higher performance based on modified modeling files and utilizes Intel Neural Compressor for FP8 quantization internally; see [running-with-fp8](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8).
![](./imgs/optimum-habana.png)
### Installation
Refer to [optimum-habana, install-the-library-and-get-example-scripts](https://github.com/huggingface/optimum-habana?tab=readme-ov-file#install-the-library-and-get-example-scripts)
Optionally, install from source:
```
$ git clone https://github.com/huggingface/optimum-habana
$ cd optimum-habana && git checkout v1.14.0 (change the version)
$ pip install -e .
$ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
$ cd examples/text-generation
$ pip install -r requirements.txt
$ pip install -r requirements_lm_eval.txt (Option)
```
### Check neural_compressor code
> optimum-habana/examples/text-generation/utils.py
>> initialize_model() -> setup_model() -> setup_quantization() -> FP8Config/prepare()/convert()
### FP8 KV cache
Introduction: [kv-cache-quantization in huggingface transformers](https://huggingface.co/blog/kv-cache-quantization)
BF16 KVCache Code -> [Modeling_all_models.py -> KVCache()](https://github.com/huggingface/optimum-habana/blob/main/optimum/habana/transformers/models/modeling_all_models.py)
FP8 KV cache code trace with Neural Compressor support, for example for Llama models:
> optimum-habana/optimum/habana/transformers/models/llama/modeling_llama.py
>> GaudiLlamaForCausalLM() -> self.model()
>>> GaudiLlamaModel() -> forward() -> decoder_layer() -> GaudiLlamaDecoderLayer() forward() -> pre_attn() -> pre_attn_forward() -> self.k_cache.update
> neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
>> PatchedKVCache() -> update()
>> PatchedModuleFusedSDPA()
Models that support FP8 KV cache:
```
microsoft/Phi-3-mini-4k-instruct
bigcode/starcoder2-3b
Qwen/Qwen2.5-7B-Instruct
meta-llama/Llama-3.2-3B-Instruct
tiiuae/falcon-7b-instruct
mistralai/Mixtral-8x7B-Instruct-v0.1
EleutherAI/gpt-j-6b
mistralai/Mistral-Nemo-Instruct-2407
...
```
### Running with FP8
Refer to [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8).
Change "--model_name_or_path" to be your model like
"meta-llama/Llama-3.1-8B-Instruct",
"Qwen/Qwen2.5-7B-Instruct", or
"mistralai/Mixtral-8x7B-Instruct-v0.1" and so on.
"--use_kv_cache" is to enable FP8 KV cache.
### Profiling
Add "--profiling_warmup_steps 5 --profiling_steps 2 --profiling_record_shapes" as args in the end of commandline of run_generation.py.
Refer to [torch.profiler.ProfilerActivity.HPU](https://github.com/huggingface/optimum-habana/blob/c9e1c23620618e2f260c92c46dfeb163545ec5ba/optimum/habana/utils.py#L305).
### FP8 Accuracy
"lm_eval.tasks", "lm_eval.evaluator", "lm_eval" are installed from the above requirements_lm_eval.txt. The tasks can be set and the default is ["hellaswag", "lambada_openai", "piqa", "winogrande"], [more info](https://github.com/EleutherAI/lm-evaluation-harness/)
| `Llama-2-7b-hf`| fp8 & fp8 KVCache| bf16 w/ bf16 KVCache|
|---------------|---------|--------|
| hellaswag | 0.5691097390957977 | 0.5704043019318861 |
| lambada_openai| 0.7360760721909567 | 0.7372404424607025 |
| piqa | 0.7850924918389554 | 0.7818280739934712 |
| winogrande | 0.6929755327545383 | 0.6929755327545383 |
| `Qwen2.5-7B-Instruct`| fp8 & fp8 KVCache| bf16 w/ bf16 KVCache|
|---------------|---------|--------|
| hellaswag | 0.2539334793865764 | 0.2539334793865764 |
| lambada_openai| 0.0 | 0.0 |
| piqa | 0.5391730141458106 | 0.5391730141458106 |
| winogrande | 0.4956590370955012 | 0.4956590370955012 |
| `Llama-3.1-8B-Instruct`| fp8 & fp8 KVCache| bf16 w/ bf16 KVCache|
|---------------|---------|--------|
| hellaswag | 0.5934076877116112 | 0.5975901214897431 |
| lambada_openai| 0.7230739375121289 | 0.7255967397632447 |
| piqa | 0.7932535364526659 | 0.8030467899891186 |
| winogrande | 0.7434885556432518 | 0.7371744277821626 |
| `Mixtral-8x7B-Instruct-v0.1`| fp8 & fp8 KVCache| bf16 w/ bf16 KVCache|
|---------------|---------|--------|
| hellaswag | 0.25323640709022105 | 0.25323640709022105 |
| lambada_openai| 0.0 | 0.0 |
| piqa | 0.528835690968444 | 0.528835690968444 |
| winogrande | 0.4956590370955012 | 0.4956590370955012 |
## VLLM example
### Overview
![](./imgs/vllm_gaudi.png)
### Installation
Refer to [Habana vllm-fork](https://github.com/HabanaAI/vllm-fork) to install.
To install `vllm-hpu-extension`, `neural_compressor`, and `vllm` from source:
```
$ git clone https://github.com/HabanaAI/vllm-fork.git
$ cd vllm-fork
$ pip install -r requirements-hpu.txt
$ python setup.py develop --user
## Check
$ pip list |grep vllm
vllm 0.6.3.dev1122+g2f43ebf5.d20241121.gaudi118 /home/fengding/vllm-fork
vllm-hpu-extension 0.1
## Validation
$ VLLM_SKIP_WARMUP=true python3 examples/offline_inference.py
......
Prompt: 'Hello, my name is', Generated text: ' Kelly and I have a job to do.\nI need someone to come over'
Prompt: 'The president of the United States is', Generated text: ' facing a sharp criticism of his handling of the coronavirus pandemic, including'
Prompt: 'The capital of France is', Generated text: ' the capital of the Socialist Party of France (SPF), with its state-'
Prompt: 'The future of AI is', Generated text: " in what's coming, not what's coming.\nI don't know what"
```
### Run FP8 calibration
Refer to [vllm-hpu-extension->calibration](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration)
```
$ git clone https://github.com/HabanaAI/vllm-hpu-extension
$ cd vllm-hpu-extension/calibration
# For Llama-3.1-8B-Instruct
$ ./calibrate_model.sh -m meta-llama/Llama-3.1-8B-Instruct -d /home/fengding/processed-data.pkl -o ./output_llama3.1.8b.Instruct -b 128 -t 1 -l 128
## Scale factors are generated in ./output_llama3.1.8b.Instruct
```
### Start vllm server
```
$ cd vllm-fork/
$ PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
PT_HPU_WEIGHT_SHARING=0 \
VLLM_CONTIGUOUS_PA=true \
VLLM_SKIP_WARMUP=true \
QUANT_CONFIG=output_llama3.1.8b.Instruct/maxabs_quant_g2.json \
python3 -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-3.1-8B-Instruct \
--port 8080 \
--gpu-memory-utilization 0.9 \
--tensor-parallel-size 1 \
--disable-log-requests \
--block-size 128 \
--quantization inc \
--kv-cache-dtype fp8_inc \
--device hpu \
--weights-load-device cpu \
--dtype bfloat16 \
--num_scheduler_steps 16 2>&1 > vllm_serving.log &
```
Refer to [vllm-fork->README_GAUDI.md](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md) for more details.
### Start client to test
```
$ curl --noproxy "*" http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "San Francisco is a", "max_tokens": 100}'
```
### Run benchmark
```
python benchmarks/benchmark_serving.py \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet.txt \
--request-rate 128 \
--num-prompts 128 \
--port 8080 \
--sonnet-input-len 128 \
--sonnet-output-len 128 \
--sonnet-prefix-len 100
```
### FP8 KV cache
Code trace
> vllm-fork/vllm/attention/backends/hpu_attn.py
>> from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
>> HPUAttentionImpl() -> self.k_cache() / self.v_cache()
> neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
>> PatchedVLLMKVCache()
> neural_compressor/torch/algorithms/fp8_quant/common.py
>> "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache)

View File

@@ -0,0 +1,107 @@
Microscaling Quantization
===============
1. [Introduction](#introduction)
2. [Get Started with Microscaling Quantization API](#get-started-with-microscaling-quantization-api)
3. [Examples](#examples)
4. [Reference](#reference)
## Introduction
Numerous breakthroughs have emerged across various fields, such as text analysis, language translation and chatbot technologies, fueled by the development of large language models (LLMs). Nevertheless, their increasing power comes with the challenge of explosive growth in parameters, posing obstacles for practical use. To balance memory limits and accuracy preservation for AI models, the Microscaling (MX) specification evolved from the well-known Microsoft Floating Point (MSFP) data type [1, 2]:
<table>
<tr>
<th>Format Name</th>
<th>Element Data type</th>
<th>Element Bits</th>
<th>Scaling Block Size</th>
<th>Scale Data Type</th>
<th>Scale Bits</th>
</tr>
<tr>
<td rowspan="2">MXFP8</td>
<td>FP8 (E5M2)</td>
<td rowspan="2">8</td>
<td rowspan="2">32</td>
<td rowspan="2">E8M0</td>
<td rowspan="2">8</td>
</tr>
<tr>
<td>FP8 (E4M3)</td>
</tr>
<tr>
<td rowspan="2">MXFP6</td>
<td>FP6 (E3M2)</td>
<td rowspan="2">6</td>
<td rowspan="2">32</td>
<td rowspan="2">E8M0</td>
<td rowspan="2">8</td>
</tr>
<tr>
<td>FP6 (E2M3)</td>
</tr>
<tr>
<td>MXFP4</td>
<td>FP4 (E2M1)</td>
<td>4</td>
<td>32</td>
<td>E8M0</td>
<td>8</td>
</tr>
<tr>
<td>MXINT8</td>
<td>INT8</td>
<td>8</td>
<td>32</td>
<td>E8M0</td>
<td>8</td>
</tr>
</table>
At an equivalent accuracy level, the MX data type demonstrates the ability to occupy a smaller area and incur lower energy costs for multiply-accumulate compared to other conventional data types on the same silicon [1].
Neural Compressor seamlessly applies the MX data type to post-training quantization, offering meticulously crafted recipes to empower users to quantize LLMs without sacrificing accuracy. The workflow is shown below.
<a target="_blank" href="./imgs/mx_workflow.png" text-align:left>
<left>
<img src="./imgs/mx_workflow.png" alt="Workflow of MX Quant (source [3])" height=120>
</left>
</a>
The memory and computational limits of LLMs are more severe than for other general neural networks, so our exploration focuses on LLMs first. The following table shows the basic MX quantization recipes in Neural Compressor and enumerates distinctions among various data types. The MX data type replaces the general float scale with powers of two to be more hardware-friendly. It adopts a granularity falling between per-channel and per-tensor to balance accuracy and memory consumption.
| | MX Format | INT8 | FP8 |
|------------|--------------|------------|------------|
| Scale | $2^{exp}$ | $\frac{MAX}{amax}$ | $\frac{MAX}{amax}$ |
| Zero point | 0 (None) | $2^{bits - 1}$ or $-min * scale$ | 0 (None) |
| Granularity | per-block (default blocksize is 32) | per-channel or per-tensor | per-channel or per-tensor |
The exponent (*exp*) is equal to `torch.floor(torch.log2(amax))`, *MAX* is the representation range of the data type, *amax* is the maximum absolute value of the per-block tensor, and *min* is the minimum value of the per-block tensor.
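As a rough illustration of the recipe above, the shared power-of-two scale for one block can be derived as follows (a minimal sketch that omits clamping to the element data type's range; function and variable names are illustrative):
```python
import torch


def mx_block_scale(block: torch.Tensor):
    """Return the scaled block and its shared power-of-two (E8M0-style) scale."""
    amax = block.abs().max()
    exp = torch.floor(torch.log2(amax))  # shared exponent, as described above
    scale = torch.pow(2.0, exp)          # power-of-two scale
    return block / scale, scale


# Example: one block of 32 weights shares a single scale
elements, scale = mx_block_scale(torch.randn(32))
```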
## Get Started with Microscaling Quantization API
To get a model quantized with Microscaling Data Types, users can use the Microscaling Quantization API as follows.
```python
from neural_compressor.torch.quantization import MXQuantConfig, prepare, convert
quant_config = MXQuantConfig(w_dtype=args.w_dtype, act_dtype=args.act_dtype, weight_only=args.woq)
user_model = prepare(model=user_model, quant_config=quant_config)
user_model = convert(model=user_model)
```
## Examples
- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant)
## Reference
[1]: Darvish Rouhani, Bita, et al. "Pushing the limits of narrow precision inferencing at cloud scale with microsoft floating point." Advances in Neural Information Processing Systems 33 (2020): 10271-10281.
[2]: OCP Microscaling Formats (MX) Specification.
[3]: Rouhani, Bita Darvish, et al. "Microscaling Data Formats for Deep Learning." arXiv preprint arXiv:2310.10537 (2023).

View File

@@ -0,0 +1,111 @@
PyTorch Mixed Precision
========================================
1. [Introduction](#introduction)
2. [Mixed Precision Support Matrix](#mixed-precision-support-matrix)
3. [Get Started](#get-started-with-autotune-api)
4. [Examples](#examples)
## Introduction
The recent growth of Deep Learning has driven the development of more complex models that require significantly more compute and memory capabilities. Several low precision numeric formats have been proposed to address the problem.
Google's [bfloat16](https://cloud.google.com/tpu/docs/bfloat16) and the [FP16: IEEE](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) half-precision format are two of the most widely used sixteen bit formats. [Mixed precision](https://arxiv.org/abs/1710.03740) training and inference using low precision formats have been developed to reduce compute and bandwidth requirements.
The 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), featuring Intel® Deep Learning Boost, is the first general-purpose x86 CPU to support the bfloat16 format. Specifically, three new bfloat16 instructions are added as a part of the AVX512_BF16 extension within Intel Deep Learning Boost: VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions allow converting to and from bfloat16 data type, while the last one performs a dot product of bfloat16 pairs.
Further details can be found in the [Hardware Numerics Document](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-deep-learning-boost-new-instruction-bfloat16.html) published by Intel.
The 4th Gen Intel® Xeon® Scalable processor supports FP16 instruction set architecture (ISA) for Intel® Advanced Vector Extensions 512 (Intel® AVX-512). The new ISA supports a wide range of general-purpose numeric operations for 16-bit half-precision IEEE-754 floating-point and complements the existing 32-bit and 64-bit floating-point instructions already available in the Intel Xeon processor based products.
Further details can be found in the [Intel AVX512 FP16 Guide](https://www.intel.com/content/www/us/en/content-details/669773/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide.html) published by Intel.
The latest Intel Xeon processors deliver the flexibility of Intel Advanced Matrix Extensions (Intel AMX), an accelerator that improves the performance of deep learning (DL) training and inference, making it ideal for workloads like NLP, recommender systems, and image recognition. Developers can code AI functionality to take advantage of the Intel AMX instruction set, and they can code non-AI functionality to use the processor instruction set architecture (ISA). Intel has integrated the Intel® oneAPI Deep Neural Network Library (oneDNN), its oneAPI DL engine, into PyTorch.
Further details can be found in the [Intel AMX Document](https://www.intel.com/content/www/us/en/content-details/785250/accelerate-artificial-intelligence-ai-workloads-with-intel-advanced-matrix-extensions-intel-amx.html) published by Intel.
<p align="center" width="100%">
<img src="./imgs/data_format.png" alt="Architecture" height=230>
</p>
## Mixed Precision Support Matrix
<table class="center">
<thead>
<tr>
<th>Framework</th>
<th>Backend</th>
<th>Backend Library</th>
<th>Backend Value</th>
<th>Support Device(cpu as default)</th>
<th>Support BF16</th>
<th>Support FP16</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="1" align="left">PyTorch</td>
<td align="left">FX</td>
<td align="left">FBGEMM</td>
<td align="left">"default"</td>
<td align="left">cpu</td>
<td align="left">&#10004;</td>
<td align="left">&#10004;</td>
</tr>
</tbody>
</table>
### Hardware and Software requirements for **BF16**
- PyTorch
1. Hardware: CPU supports `avx512_bf16` instruction set.
2. Software: torch >= [1.11.0](https://download.pytorch.org/whl/torch_stable.html).
### Hardware and Software requirements for **FP16**
- PyTorch
1. Hardware: CPU supports `avx512_fp16` instruction set.
2. Software: torch >= [1.11.0](https://download.pytorch.org/whl/torch_stable.html).
> Note: To run FP16 on Intel-AMX, please set the environment variable `ONEDNN_MAX_CPU_ISA`:
> ```export ONEDNN_MAX_CPU_ISA=AVX512_CORE_AMX_FP16```
### Accuracy-driven mixed precision
BF16/FP16 conversion may lead to an accuracy drop. Intel® Neural Compressor provides an accuracy-driven tuning function to reduce accuracy loss,
which can fall back converted ops to FP32, if set in the config, to get better accuracy. To enable this function, users only need to provide
`eval_fn` and `eval_args` for `autotune`.
Note that the IPEX backend does not support accuracy-driven mixed precision.
## Get Started with autotune API
To get a bf16/fp16 model, users can use the `autotune` interface with `MixedPrecisionConfig` as follows.
- BF16:
```python
from neural_compressor.torch.quantization import MixedPrecisionConfig, TuningConfig, autotune
def eval_acc_fn(model):
    ...  # user-defined evaluation that returns accuracy
    return acc
# modules might fall back to fp32 to get better accuracy
custom_tune_config = TuningConfig(config_set=[MixedPrecisionConfig(dtype=["bf16", "fp32"])], max_trials=3)
best_model = autotune(model=build_torch_model(), tune_config=custom_tune_config, eval_fn=eval_acc_fn)
```
- FP16:
```python
from neural_compressor.torch.quantization import MixedPrecisionConfig, TuningConfig, autotune
def eval_acc_fn(model):
    ...  # user-defined evaluation that returns accuracy
    return acc
# modules might fall back to fp32 to get better accuracy
custom_tune_config = TuningConfig(config_set=[MixedPrecisionConfig(dtype=["fp16", "fp32"])], max_trials=3)
best_model = autotune(model=build_torch_model(), tune_config=custom_tune_config, eval_fn=eval_acc_fn)
```
## Examples
Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/mixed_precision) on how to apply Mixed Precision to a model.

View File

@@ -0,0 +1,112 @@
PyTorch Smooth Quantization
========================================
1. [Introduction](#introduction)
2. [Usage](#usage)
3. [Validated Models](#validated-models)
4. [Supported Framework Matrix](#supported-framework-matrix)
## Introduction
Quantization is a common compression operation to reduce memory and accelerate inference by converting the floating point matrix to an integer matrix. For large language models (LLMs) with gigantic parameters, the systematic outliers make quantization of activations difficult. [SmoothQuant](https://arxiv.org/abs/2211.10438), a training-free post-training quantization (PTQ) solution, offline migrates this difficulty from activations to weights with a mathematically equivalent transformation.
## Usage
### Fixed Alpha
To set a fixed alpha for the entire model, users can follow this example:
```python
from neural_compressor.torch.quantization import SmoothQuantConfig, convert, prepare
def run_fn(model):
model(example_inputs)
quant_config = SmoothQuantConfig(alpha=0.5)
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
```
`SmoothQuantConfig` description:
`alpha`: a smoothing factor used to calculate the per-channel conversion scale and to balance the quantization difficulty of activations and weights. Float value; the default is 0.5.
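In the SmoothQuant paper's formulation, the per-channel equivalent scale is $s_j = \max(|X_j|)^{\alpha} / \max(|W_j|)^{1-\alpha}$, so a larger `alpha` migrates more of the quantization difficulty from activations to weights.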
> **Note:** `alpha="auto"` and alpha auto-tuning were supported in the old API; please stay tuned for the new API's support for auto alpha.
### Specify Quantization Rules
Intel(R) Neural Compressor supports specifying quantization rules by operator type for Smooth Quantization. Users can use `set_local` to fall back an op type in `SmoothQuantConfig` for this purpose.
Here we don't quantize `Linear` layers.
```python
# fallback by op_type
quant_config.set_local("Linear", SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32"))
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
```
To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant).
## Validated Models
- Neural Compressor: 2.1
- IPEX (Intel Extension for PyTorch): 2.0/2.1
- Dataset: lambada_openai
- Task: text-generation provided by [ITREX](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/huggingface/pytorch/text-generation/quantization)
- alpha in [0.4, 0.6] is the sweet-spot region reported in the SmoothQuant paper.

A list of models that achieved a <1% accuracy drop is shown below.
| Model/Last token accuracy | FP32 Accuracy | INT8 (w/ SmoothQuant) | Notes |
|:----------:|:------:|:------:|-----------------------------------|
| bigscience/bloom-560m | 0.354 | 0.3542 | alpha=0.5, Ipex 2.1 |
| bigscience/bloom-1b7 | 0.4634 | 0.4936 | alpha=0.5, Ipex 2.0 |
| bigscience/bloom-3b | 0.518 | 0.5185 | alpha=0.8, Ipex 2.1 |
| bigscience/bloom-7b1 | 0.5764 | 0.5977 | alpha=0.5, Ipex 2.0 |
| bigscience/bloomz-560m | 0.3947 | 0.3930 | alpha=0.8, Ipex 2.1 |
| bigscience/bloomz-1b7 | 0.4828 | 0.4906 | alpha=0.5, Ipex 2.1 |
| bigscience/bloomz-3b | 0.5018 | 0.4980 | alpha=0.5, Ipex 2.1 |
| bigscience/bloomz-7b1 | 0.5593 | 0.5552 | alpha=0.5, Ipex 2.1 |
| facebook/opt-125m | 0.379 | 0.3757 | alpha=0.5, Ipex 2.1 |
| facebook/opt-350m | 0.4516 | 0.4533 | alpha=0.8, Ipex 2.1 |
| facebook/opt-1.3b | 0.5789 | 0.5742 | alpha=0.8, Ipex 2.0 |
| facebook/opt-2.7b | 0.6365 | 0.6404 | alpha=0.5, Ipex 2.0 |
| facebook/opt-6.7b | 0.6769 | 0.6804 | alpha=0.5, Ipex 2.0 |
| facebook/opt-13b | 0.6872 | 0.6814 | alpha=0.5, Ipex 2.1 |
| facebook/opt-30b | 0.7149 | 0.7128 | alpha=0.5, Ipex 2.1 |
| facebook/opt-66b | 0.7398 | 0.7326 | alpha=0.5, Ipex 2.1 |
| LLaMa-7b | 0.7361 | 0.7357 | alpha=0.8, Ipex 2.1 |
| LLaMa-13b | 0.7627 | 0.7590 | alpha=0.7, Ipex 2.1 |
| LLaMa-30b | 0.7759 | 0.7840 | alpha=0.7, Ipex 2.1 |
| LLaMa-65b | 0.7908 | 0.7957 | alpha=0.9, Ipex 2.1 |
| EleutherAI/gpt-j-6B* | 0.6831 | 0.6821 | alpha=1.0, Ipex 2.1 |
| MBZUAI/LaMini-GPT-124m | 0.3804 | 0.3887 | alpha=0.5, Ipex 2.1 |
| MBZUAI/LaMini-GPT-774m | 0.5048 | 0.5057 | alpha=0.5, Ipex 2.1 |
| MBZUAI/LaMini-GPT-1.5b | 0.5443 | 0.5436 | alpha=0.5, Ipex 2.1 |
| mosaicml/mpt-7b-chat | 0.655 | 0.6499 | alpha=0.7, Ipex 2.1 |
| stabilityai/stablelm-base-alpha-3b | 0.4172 | 0.4149 | alpha=0.6, Ipex 2.1 |
| togethercomputer/RedPajama-INCITE-Base-3B-v1 | 0.6542 | 0.6735 | alpha=0.5, Ipex 2.1 |
| togethercomputer/RedPajama-INCITE-Chat-3B-v1* | 0.6718 | 0.6740 | alpha=0.5, Ipex 2.0 |
| togethercomputer/RedPajama-INCITE-Instruct-3B-v1* | 0.6569 | 0.6621 | alpha=0.5, Ipex 2.0 |
| togethercomputer/RedPajama-INCITE-Base-7B-v0.1* | 0.7143 | 0.7221 | alpha=0.5, Ipex 2.0 |
| togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1* | 0.6895 | 0.6953 | alpha=0.5, Ipex 2.0 |
| databricks/dolly-v1-6b* | 0.6866 | 0.6895 | alpha=0.8, Ipex 2.1 |
| databricks/dolly-v2-3b* | 0.6297 | 0.6247 | alpha=0.5, Ipex 2.1 |
| tiiuae/falcon-7b-instruct | 0.6437 | 0.6392 | alpha=0.7, PyTorch |
Please refer to the step-by-step [instruction](../../examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md) for details.
Please note that for models marked with an asterisk (*), all add ops were set to FP32 during the quantization step to achieve the desired results.
## Supported Framework Matrix
| Framework | Alpha | Folding |
|:---------:|--------------|------------|
| PyTorch | [0-1] | False |
| IPEX | [0-1] | True / False (version > 2.1) |

View File

@@ -0,0 +1,108 @@
PyTorch Static Quantization
========================================
1. [Introduction](#introduction)
2. [Get Started](#get-started) \
2.1 [Static Quantization with IPEX Backend](#static-quantization-with-ipex-backend) \
2.1.1 [Usage Sample with IPEX](#usage-sample-with-ipex) \
2.1.2 [Specify Quantization Rules](#specify-quantization-rules) \
2.1.3 [Model Examples](#model-examples) \
2.2 [Static Quantization with PT2E Backend](#static-quantization-with-pt2e-backend) \
2.2.1 [Usage Sample with PT2E](#usage-sample-with-pt2e) \
2.2.2 [Model Examples with PT2E](#model-examples-with-pt2e)
## Introduction
Post-Training Quantization (PTQ) is a technique used to convert a pre-trained floating-point model to a quantized model. This approach does not require model retraining. Instead, it uses calibration data to determine the optimal quantization parameters. Static quantization involves calibrating both weights and activations during the quantization process. Currently, we support two paths to perform static PTQ: [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) and [PyTorch 2 Export Quantization (PT2E)](https://pytorch.org/tutorials/prototype/pt2e_quant_x86_inductor.html).
## Get Started
### Static Quantization with IPEX Backend
Intel Extension for PyTorch (IPEX) provides optimizations specifically for Intel hardware, improving the performance of PyTorch models through efficient execution on CPUs. IPEX supports PTQ, allowing users to quantize models to lower precision to reduce model size and inference time while maintaining accuracy.
The design philosophy of the quantization interface of Intel(R) Neural Compressor is ease of use. It requires the user to provide `model`, `calibration function`, and `example inputs`. These parameters are used to quantize and tune the model.
`model` is the framework model location or the framework model object.
`calibration function` is used to determine the appropriate quantization parameters, such as `scale` and `zero-point`, for the model's weights and activations. This process is crucial for minimizing the loss of accuracy that can occur when converting from floating-point to lower-precision format.
IPEX leverages just-in-time (JIT) compilation techniques for optimizing the model. `example inputs` is used to trace the computational graph of the model, enabling various optimizations and transformations that are specific to IPEX. This tracing process captures the operations performed by the model, allowing IPEX to apply quantization optimizations effectively. `example inputs` should be representative of the actual data the model will process to ensure accurate calibration.
#### Usage Sample with IPEX
```python
import intel_extension_for_pytorch as ipex
from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare
quant_config = StaticQuantConfig(act_sym=True, act_algo="minmax")
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
```
> [!IMPORTANT]
> To use static quantization with the IPEX backend, please explicitly import IPEX at the beginning of your program.
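In the sample above, `run_fn` is the user-provided calibration function. A minimal sketch, assuming you have a `calib_dataloader` of representative inputs, could be:
```python
def run_fn(model):
    # Feed a few representative batches so the inserted observers can record
    # activation ranges; no labels or backward pass are needed.
    for i, (inputs, _) in enumerate(calib_dataloader):
        model(inputs)
        if i >= 100:  # a small number of batches is usually sufficient
            break
```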
#### Specify Quantization Rules
Intel(R) Neural Compressor supports specifying quantization rules by operator name or operator type. Users can use `set_local` to fall back either an `op_name` or an `op_type` in `StaticQuantConfig` for this purpose.
1. Example of `op_name_dict`
Here we don't quantize the layer named `fc1`.
```python
# fallback by op_name
quant_config.set_local("fc1", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
```
2. Example of `op_type_dict`
Here we don't quantize `Linear` layers.
```python
# fallback by op_type
quant_config.set_local("Linear", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
```
#### Model Examples
Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model.
### Static Quantization with PT2E Backend
Compared to the IPEX backend, which uses JIT compilation to capture the eager model, the PT2E path uses `torch.dynamo` to capture the eager model into an FX graph model, and then inserts the observers and Q/DQ pairs on it. Finally, it uses `torch.compile` to perform pattern matching and replace the Q/DQ pairs with optimized quantized operators.
#### Usage Sample with PT2E
There are four steps to perform W8A8 static quantization with PT2E backend: `export`, `prepare`, `convert` and `compile`.
```python
import torch
from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert
# Prepare the float model and example inputs for export model
model = UserFloatModel()
example_inputs = ...
# Export eager model into FX graph model
exported_model = export(model=model, example_inputs=example_inputs)
# Quantize the model
quant_config = StaticQuantConfig()
prepared_model = prepare(exported_model, quant_config=quant_config)
# Calibrate
run_fn(prepared_model)
q_model = convert(prepared_model)
# Compile the quantized model and replace the Q/DQ pattern with Q-operator
from torch._inductor import config
config.freezing = True
opt_model = torch.compile(q_model)
```
> Note: The `set_local` of `StaticQuantConfig` will be supported after the torch 2.4 release.
#### Model Examples with PT2E
Users could refer to [cv examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/static_quant) and [llm examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e) on how to quantize a new model.

View File

@@ -0,0 +1,329 @@
PyTorch Weight Only Quantization
===============
- [Introduction](#introduction)
- [Supported Matrix](#supported-matrix)
- [Usage](#usage)
- [Get Started](#get-started)
- [Common arguments](#common-arguments)
- [RTN](#rtn)
- [GPTQ](#gptq)
- [AutoRound](#autoround)
- [AWQ](#awq)
- [TEQ](#teq)
- [HQQ](#hqq)
- [Specify Quantization Rules](#specify-quantization-rules)
- [Saving and Loading](#saving-and-loading)
- [Layer Wise Quantization](#layer-wise-quantization)
- [Efficient Usage on Client-Side](#efficient-usage-on-client-side)
- [Examples](#examples)
## Introduction
As large language models (LLMs) become more prevalent, there is a growing need for new and improved quantization methods that can meet the computational demands of these modern architectures while maintaining accuracy. Compared to normal quantization like W8A8, weight-only quantization is usually a better trade-off between performance and accuracy: as we will see below, the bottleneck of deploying LLMs is memory bandwidth, and weight-only quantization normally leads to better accuracy.
Model inference: Roughly speaking, two key steps are required to get the model's result. The first is moving the model from memory to cache piece by piece, in which memory bandwidth $B$ and parameter count $P$ are the key factors; theoretically the time cost is $P \times 4 / B$ (for FP32 weights). The second is computation, in which the device's computation capacity $C$ measured in FLOPS and the forward FLOPs $F$ play the key roles; theoretically the cost is $F / C$.
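For example, a 7B-parameter model stored in FP32 must move roughly $7 \times 10^9 \times 4 \approx 28$ GB of weights per forward pass, which alone costs about 0.28 s on a 100 GB/s memory bus, regardless of how fast the compute units are.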
Text generation: The most famous application of LLMs is text generation, which predicts the next token/word based on the inputs/context. To generate a sequence of texts, we need to predict them one by one. In this scenario, $F\approx P$ if some operations like bmm are ignored and past key values have been saved. However, the $C/B$ ratio of a modern device can be up to **100X**, which makes memory bandwidth the bottleneck in this scenario.
Besides, as mentioned in many papers [1][2], activation quantization is the main cause of the accuracy drop. So for the text generation task, weight-only quantization is the preferred option in most cases.
Theoretically, round-to-nearest (RTN) is the most straightforward way to quantize weights using scale maps. However, when the number of bits is small (e.g., 3), the MSE loss is larger than expected. A group size is introduced to reduce the number of elements sharing the same scale and improve accuracy.
## Supported Matrix
| Algorithms/Backend | PyTorch eager mode |
|--------------|----------|
| RTN | &#10004; |
| GPTQ | &#10004; |
| AutoRound| &#10004; |
| AWQ | &#10004; |
| TEQ | &#10004; |
| HQQ | &#10004; |
> **RTN:** The most intuitive quantization method. It does not require additional datasets and is a very fast quantization method. Generally speaking, RTN converts the weight into a uniformly distributed integer data type, though some algorithms, such as QLoRA, propose a non-uniform NF4 data type and prove its theoretical optimality.
> **GPTQ:** A new one-shot weight quantization method based on approximate second-order information, that is both highly-accurate and highly efficient[4]. The weights of each column are updated based on the fixed-scale pseudo-quantization error and the inverse of the Hessian matrix calculated from the activations. The updated columns sharing the same scale may generate a new max/min value, so the scale needs to be saved for restoration.
> **AutoRound:** AutoRound is an advanced weight-only quantization algorithm for low-bits LLM inference. It's tailored for a wide range of models and consistently delivers noticeable improvements, often significantly outperforming SignRound[5] with the cost of more tuning time for quantization.
> **AWQ:** Proves that protecting only 1% of salient weights can greatly reduce quantization error. The salient weight channels are selected by observing the distribution of activations and weights per channel. The salient weights are multiplied by a large scale factor before quantization to preserve them.
> **TEQ:** A trainable equivalent transformation that preserves the FP32 precision in weight-only quantization. It is inspired by AWQ while providing a new solution to search for the optimal per-channel scaling factor between activations and weights.
> **HQQ:** The HQQ[6] method focuses specifically on minimizing errors in the weights rather than the layer activation. Additionally, by incorporating a sparsity-promoting loss, such as the $l_{p<1}$-norm, we effectively model outliers through a hyper-Laplacian distribution. This distribution more accurately captures the heavy-tailed nature of outlier errors compared to the squared error, resulting in a more nuanced representation of error distribution.
## Usage
### Get Started
WeightOnlyQuant for PyTorch uses the prepare and convert [APIs](./PyTorch.md#quantization-apis).
#### Common arguments
| Config | Capability |
|---|---|
| dtype (str)| ['int', 'nf4', 'fp4'] |
| bits (int)| [1, ..., 8] |
| group_size (int)| [-1, 1, ..., $C_{in}$] |
| use_sym (bool)| [True, False] |
| quant_lm_head (bool)| [False, True] |
| use_double_quant (bool) | [True, False] |
| double_quant_dtype (str) | ['int'] |
| double_quant_bits (int) | [1, ..., bits] |
| double_quant_use_sym (bool) | [True, False] |
| double_quant_group_size (int) | [-1, 1, ..., $C_{in}$] |
Notes:
- *group_size = -1* refers to **per output channel quantization**. Taking a linear layer (input channel = $C_{in}$, output channel = $C_{out}$) for instance, when *group_size = -1*, quantization computes a total of $C_{out}$ quantization parameters. Otherwise, when *group_size = gs*, quantization parameters are calculated for every $gs$ elements along the input channel, leading to a total of $C_{out} \times (C_{in} / gs)$ quantization parameters.
- 4-bit NormalFloat(NF4) is proposed in QLoRA[7]. 'fp4' includes [fp4_e2m1](../../neural_compressor/adaptor/torch_utils/weight_only.py#L37) and [fp4_e2m1_bnb](https://github.com/TimDettmers/bitsandbytes/blob/18e827d666fa2b70a12d539ccedc17aa51b2c97c/bitsandbytes/functional.py#L735). By default, fp4 refers to fp4_e2m1_bnb.
- *quant_lm_head* defaults to False. This means that, except for transformer blocks, the last layer in transformer models will not be quantized by default. The last layer may be named "lm_head", "output_layer" or "embed_out".
- Only RTN and GPTQ support double quant.
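As an illustration, these common arguments are passed to the algorithm config classes described below; a sketch with `RTNConfig` (the values are arbitrary) is:
```python
from neural_compressor.torch.quantization import RTNConfig

# 4-bit symmetric INT weights; every 128 input-channel elements share one scale
quant_config = RTNConfig(dtype="int", bits=4, use_sym=True, group_size=128)
```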
#### RTN
| rtn_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| group_dim (int) | Dimension for grouping | 1 |
| use_full_range (bool) | Enables full range for activations | False |
| use_mse_search (bool) | Enables mean squared error (MSE) search | False |
| use_layer_wise (bool) | Enables quantize model per layer | False |
| model_path (str) | Model path that is used to load state_dict per layer | |
> **Note:** `model_path` is only used when `use_layer_wise=True`. Stay tuned for further layer-wise updates.
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, RTNConfig
quant_config = RTNConfig()
model = prepare(model, quant_config)
model = convert(model)
```
#### GPTQ
| gptq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| use_mse_search (bool) | Enables mean squared error (MSE) search | False
| use_layer_wise (bool) | Enables quantize model per layer | False |
| model_path (str) | Model path that is used to load state_dict per layer | |
| use_double_quant (bool) | Enables double quantization | False |
| act_order (bool) | Whether to sort Hessian's diagonal values to rearrange channel-wise quantization order | False |
| percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01 |
| block_size (int) | Execute GPTQ quantization per block, block shape = [C_out, block_size] | 128 |
| static_groups (bool) | Whether to calculate group-wise quantization parameters in advance. This option mitigates act_order's extra computational requirements. | False |
| true_sequential (bool) | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but a slower overall quantization process. | False |
> **Note:** `model_path` is only used when `use_layer_wise=True`. Stay tuned for further layer-wise updates.
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, GPTQConfig
quant_config = GPTQConfig()
model = prepare(model, quant_config)
run_fn(model) # calibration
model = convert(model)
```
#### AutoRound
| autoround_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| enable_full_range (bool) | Whether to enable full range quantization | False
| batch_size (int) | Batch size for training | 8 |
| lr_scheduler | The learning rate scheduler to be used | None |
| enable_quanted_input (bool) | Whether to use quantized input data | True |
| enable_minmax_tuning (bool) | Whether to enable min-max tuning | True |
| lr (float) | The learning rate | 0 |
| minmax_lr (float) | The learning rate for min-max tuning | None |
| low_gpu_mem_usage (bool) | Whether to use low GPU memory | True |
| iters (int) | Number of iterations | 200 |
| seqlen (int) | Length of the sequence | 2048 |
| n_samples (int) | Number of samples | 512 |
| sampler (str) | The sampling method | "rand" |
| seed (int) | The random seed | 42 |
| n_blocks (int) | Number of blocks | 1 |
| gradient_accumulate_steps (int) | Number of gradient accumulation steps | 1 |
| not_use_best_mse (bool) | Whether to use mean squared error | False |
| dynamic_max_gap (int) | The dynamic maximum gap | -1 |
| scale_dtype (str) | The data type of quantization scale to be used, different kernels have different choices | "float16" |
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, AutoRoundConfig
quant_config = AutoRoundConfig()
model = prepare(model, quant_config)
run_fn(model) # calibration
model = convert(model)
```
#### AWQ
| awq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| group_dim (int) | Dimension for grouping | 1 |
| use_full_range (bool) | Enables full range for activations | False |
| use_mse_search (bool) | Enables mean squared error (MSE) search | False |
| use_layer_wise (bool) | Enables quantize model per layer | False |
| use_auto_scale (bool) | Enables best scales search based on activation distribution | True |
| use_auto_clip (bool) | Enables clip range search | True |
| folding (bool) | Allows inserting a mul op before a linear layer when the scale cannot be absorbed by the last layer | False |
> **Note:** Stay tuned for layer-wise updates.
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, AWQConfig
quant_config = AWQConfig()
model = prepare(model, quant_config, example_inputs=example_inputs)
run_fn(model) # calibration
model = convert(model)
```
#### TEQ
| teq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| group_dim (int) | Dimension for grouping | 1 |
| use_full_range (bool) | Enables full range for activations | False |
| use_mse_search (bool) | Enables mean squared error (MSE) search | False |
| use_layer_wise (bool) | Enables quantize model per layer | False |
| use_double_quant (bool) | Enables double quantization | False |
| folding (bool) | Allows inserting a mul op before a linear layer when the scale cannot be absorbed by the last layer | False |
> **Note:** Stay tuned for layer-wise updates.
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, TEQConfig
quant_config = TEQConfig()
model = prepare(model, quant_config, example_inputs=example_inputs)
train_fn(model) # calibration
model = convert(model)
```
#### HQQ
| hqq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| quant_zero (bool) | Whether to quantize zero point | True |
| quant_scale (bool) | Whether to quantize the scale | False |
| scale_quant_group_size (int) | The group size for quantizing scale | 128 |
``` python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, HQQConfig
quant_config = HQQConfig()
model = prepare(model, quant_config)
run_fn(model) # calibration
model = convert(model)
```
### Specify Quantization Rules
Intel(R) Neural Compressor supports specifying quantization rules by operator name or operator type. Users can set `local` in a dict or use the `set_local` method of the config class for this purpose.
1. Example of setting `local` from a dict
```python
quant_config = {
"rtn": {
"global": {
"dtype": "int",
"bits": 4,
"group_size": -1,
"use_sym": True,
},
"local": {
"lm_head": {
"dtype": "fp32",
},
},
}
}
```
2. Example of using `set_local`
```python
quant_config = RTNConfig()
lm_head_config = RTNConfig(dtype="fp32")
quant_config.set_local("lm_head", lm_head_config)
```
### Saving and Loading
The `saved_results` folder contains two files, `quantized_model.pt` and `qconfig.json`, and the generated model is a quantized model that includes `WeightOnlyLinear` modules. To support low-memory inference, Intel(R) Neural Compressor implemented `WeightOnlyLinear`, a `torch.nn.Module`, to compress the fake-quantized FP32 model. Since torch does not provide flexible data type storage, `WeightOnlyLinear` packs low-bit data (weights and zero points) into a larger data type, such as `torch.int8` or `torch.int32`. When using `WeightOnlyLinear` for inference, it restores the compressed data to float32 and runs the torch linear function.
```python
# Quantization code
from neural_compressor.torch.quantization import prepare, convert, RTNConfig
quant_config = RTNConfig()
model = prepare(model, quant_config)
model = convert(model)
# save
model.save("saved_results")
# load
from neural_compressor.torch.quantization import load
orig_model = YOURMODEL()
loaded_model = load(
"saved_results", original_model=orig_model
) # Please note that the original_model parameter passes the original model.
```
## Layer Wise Quantization
As the size of LLMs continues to grow, loading the entire model into a single GPU card or the RAM of a client machine becomes impractical. To address this challenge, we introduce Layer-wise Quantization (LWQ), a method that quantizes LLMs layer by layer or block by block. This approach significantly reduces memory consumption. The diagram below illustrates the LWQ process.
<img src="./imgs/lwq.png" width=780 height=429>
*Figure 1: The process of layer-wise quantization for a PyTorch model. Grey means empty parameters and blue represents parameters that need to be quantized. Every rectangle inside the model represents one layer.*
Currently, we support LWQ for `RTN`, `AutoRound`, and `GPTQ`.
Here, we take the `RTN` algorithm as example to demonstrate the usage of LWQ.
```python
from neural_compressor.torch.quantization import RTNConfig, convert, prepare
from neural_compressor.torch import load_empty_model
model_state_dict_path = "/path/to/model/state/dict"
float_model = load_empty_model(model_state_dict_path)
quant_config = RTNConfig(use_layer_wise=True)
prepared_model = prepare(float_model, quant_config)
quantized_model = convert(prepared_model)
```
## Efficient Usage on Client-Side
For client machines with limited RAM and cores, we offer optimizations to reduce computational overhead and minimize memory usage. For detailed information, please refer to [Quantization on Client](https://github.com/intel/neural-compressor/blob/master/docs/source/3x/client_quant.md).
## Examples
Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant.
## Reference
[1]. Xiao, Guangxuan, et al. "Smoothquant: Accurate and efficient post-training quantization for large language models." arXiv preprint arXiv:2211.10438 (2022).
[2]. Wei, Xiuying, et al. "Outlier suppression: Pushing the limit of low-bit transformer language models." arXiv preprint arXiv:2209.13325 (2022).
[3]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[4]. Frantar, Elias, et al. "Gptq: Accurate post-training quantization for generative pre-trained transformers." arXiv preprint arXiv:2210.17323 (2022).
[5]. Cheng, Wenhua, et al. "Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs" arXiv preprint arXiv:2309.05516 (2023).
[6]. Badri, Hicham and Shaji, Appu. "Half-Quadratic Quantization of Large Machine Learning Models." [Online] Available: <https://mobiusml.github.io/hqq_blog/> (2023).
[7]. Dettmers, Tim, et al. "Qlora: Efficient finetuning of quantized llms." arXiv preprint arXiv:2305.14314 (2023).

View File

@@ -0,0 +1,278 @@
Torch
=================================================
1. [Introduction](#introduction)
2. [Torch-like APIs](#torch-like-apis)
3. [Supported Matrix](#supported-matrix)
4. [Common Problems](#common-problems)
## Introduction
`neural_compressor.torch` provides a Torch-like API and integrates various model compression methods at the granularity of `torch.nn.Module`. It supports a comprehensive range of models, including but not limited to CV models, NLP models, and large language models. A variety of quantization methods are available, including classic INT8 quantization, SmoothQuant, and the popular weight-only quantization. Neural Compressor also provides the latest research in simulation work, such as FP8 emulation quantization and MX data type emulation quantization.
In terms of ease of use, Neural Compressor is committed to providing an easy-to-use user interface and an easy-to-extend structure design: on the one hand, it reuses the PyTorch `prepare` and `convert` APIs; on the other hand, the `Quantizer` base class makes it convenient to customize `prepare` and `convert`.
For more details, please refer to [link](https://github.com/intel/neural-compressor/discussions/1527) in the Neural Compressor discussion space.
So far, `neural_compressor.torch` still relies on the backend to generate the quantized model and run it on the corresponding backend, but in the future neural_compressor plans to provide a generalized, device-agnostic Q-DQ model, so as to achieve one-time quantization and arbitrary deployment.
## Torch-like APIs
Currently, we support the three user scenarios below through the `prepare`&`convert`, `autotune`, and `load` APIs.
- One-time quantization of the model
- Get the best quantized model by setting the search scope and target
- Direct deployment of the quantized model
### Quantization APIs
```python
def prepare(
model: torch.nn.Module,
quant_config: BaseConfig,
inplace: bool = True,
example_inputs: Any = None,
):
"""Prepare the model for calibration.
Insert observers into the model so that it can monitor the input and output tensors during calibration.
Args:
model (torch.nn.Module): origin model
quant_config (BaseConfig): path to quantization config
inplace (bool, optional): It will change the given model in-place if True.
example_inputs (tensor/tuple/dict, optional): used to trace torch model.
Returns:
prepared and calibrated module.
"""
```
```python
def convert(
model: torch.nn.Module,
quant_config: BaseConfig = None,
inplace: bool = True,
):
"""Convert the prepared model to a quantized model.
Args:
model (torch.nn.Module): the prepared model
quant_config (BaseConfig, optional): path to quantization config, for special usage.
inplace (bool, optional): It will change the given model in-place if True.
Returns:
The quantized model.
"""
```
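A minimal end-to-end use of these two APIs (RTN is chosen only as an illustration; algorithms that need calibration also require running representative data between `prepare` and `convert`):
```python
from neural_compressor.torch.quantization import RTNConfig, convert, prepare

quant_config = RTNConfig()                           # weight-only RTN as an example
prepared_model = prepare(float_model, quant_config)  # float_model is your torch.nn.Module
# for calibration-based algorithms, run calibration data through prepared_model here
quantized_model = convert(prepared_model)
```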
### Autotune API
```python
def autotune(
model: torch.nn.Module,
tune_config: TuningConfig,
eval_fn: Callable,
eval_args=None,
run_fn=None,
run_args=None,
example_inputs=None,
):
"""The main entry of auto-tune.
Args:
model (torch.nn.Module): _description_
tune_config (TuningConfig): _description_
eval_fn (Callable): for evaluation of quantized models.
eval_args (tuple, optional): arguments used by eval_fn. Defaults to None.
run_fn (Callable, optional): for calibration to quantize model. Defaults to None.
run_args (tuple, optional): arguments used by run_fn. Defaults to None.
example_inputs (tensor/tuple/dict, optional): used to trace torch model. Defaults to None.
Returns:
The quantized model.
"""
```
### Load API
`neural_compressor.torch` links the save function to the quantized model. If `model.save` already exists, Neural Compressor renames the previous function to `model.orig_save`.
```python
def save(self, output_dir="./saved_results"):
"""
Args:
self (torch.nn.Module): the quantized model.
output_dir (str, optional): path to save the quantized model
"""
```
```python
def load(output_dir="./saved_results", model=None):
"""The main entry of load for all algorithms.
Args:
output_dir (str, optional): path to quantized model folder. Defaults to "./saved_results".
model (torch.nn.Module, optional): original model, suggest to use empty tensor.
Returns:
The quantized model
"""
```
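Following the signatures above, a typical save/load round trip looks like this (paths and model variables are placeholders):
```python
# after quantization
q_model.save("./saved_results")

# later, for deployment
from neural_compressor.torch.quantization import load

loaded_model = load("./saved_results", model=original_float_model)
```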
## Supported Matrix
<table class="tg"><thead>
<tr>
<th class="tg-9wq8">Method<br></th>
<th class="tg-9wq8">Algorithm</th>
<th class="tg-9wq8">Backend</th>
<th class="tg-9wq8">Support Status</th>
<th class="tg-9wq8">Usage Link</th>
</tr></thead>
<tbody>
<tr>
<td class="tg-9wq8" rowspan="6">Weight Only Quantization<br></td>
<td class="tg-9wq8">Round to Nearest (RTN)<br></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#rtn">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://arxiv.org/abs/2210.17323>GPTQ</a><br></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#gptq">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://arxiv.org/abs/2306.00978>AWQ</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#awq">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://arxiv.org/abs/2309.05516>AutoRound</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#autoround">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://arxiv.org/abs/2310.10944>TEQ</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#teq">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://mobiusml.github.io/hqq_blog>HQQ</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_WeightOnlyQuant.md#hqq">link</a></td>
</tr>
<tr>
<td class="tg-9wq8">Smooth Quantization</td>
<td class="tg-9wq8"><a href=https://proceedings.mlr.press/v202/xiao23c.html>SmoothQuant</a></td>
<td class="tg-9wq8"><a href=https://pytorch.org/tutorials/recipes/recipes/intel_extension_for_pytorch.html>intel-extension-for-pytorch</a></td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_SmoothQuant.md">link</a></td>
</tr>
<tr>
<td class="tg-9wq8" rowspan="3">Static Quantization</td>
    <td class="tg-9wq8" rowspan="3"><a href=https://pytorch.org/docs/master/quantization.html#post-training-static-quantization>Post-training Static Quantization</a></td>
<td class="tg-9wq8">intel-extension-for-pytorch (INT8)</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_StaticQuant.md">link</a></td>
</tr>
<tr>
<td class="tg-9wq8"><a href=https://pytorch.org/docs/stable/torch.compiler_deepdive.html>TorchDynamo (INT8)</a></td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_StaticQuant.md">link</a></td>
  </tr>
  <tr>
    <td class="tg-9wq8"><a href=https://docs.habana.ai/en/latest/index.html>Intel Gaudi AI accelerator (FP8)</a></td>
    <td class="tg-9wq8">&#10004</td>
    <td class="tg-9wq8"><a href="PT_FP8Quant.md">link</a></td>
  </tr>
<tr>
<td class="tg-9wq8">Dynamic Quantization</td>
    <td class="tg-9wq8"><a href=https://pytorch.org/docs/master/quantization.html#post-training-dynamic-quantization>Post-training Dynamic Quantization</a></td>
<td class="tg-9wq8">TorchDynamo</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_DynamicQuant.md">link</a></td>
</tr>
<tr>
<td class="tg-9wq8">MX Quantization</td>
<td class="tg-9wq8"><a href=https://arxiv.org/pdf/2310.10537>Microscaling Data Formats for
Deep Learning</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_MXQuant.md">link</a></td>
</tr>
<tr>
<td class="tg-9wq8">Mixed Precision</td>
<td class="tg-9wq8"><a href=https://arxiv.org/abs/1710.03740>Mixed precision</a></td>
<td class="tg-9wq8">PyTorch eager mode</td>
<td class="tg-9wq8">&#10004</td>
<td class="tg-9wq8"><a href="PT_MixPrecision.md">link</a></td>
</tr>
<tr>
<td class="tg-9wq8">Quantization Aware Training</td>
<td class="tg-9wq8"><a href=https://pytorch.org/docs/master/quantization.html#quantization-aware-training-for-static-quantization>Quantization Aware Training</a></td>
<td class="tg-9wq8">TorchDynamo</td>
<td class="tg-9wq8">stay tuned</td>
<td class="tg-9wq8">stay tuned</td>
</tr>
</tbody></table>
## Common Problems
1. How to choose the backend between `intel-extension-for-pytorch` and `TorchDynamo`?
> Neural Compressor provides automatic logic to detect which backend should be used.
> <table class="tg"><thead>
<tr>
<th class="tg-9wq8">Environment</th>
<th class="tg-9wq8">Automatic Backend</th>
</tr></thead>
<tbody>
<tr>
<td class="tg-9wq8">import torch</td>
<td class="tg-9wq8">torch.dynamo</td>
</tr>
<tr>
<td class="tg-9wq8">import torch<br>import intel-extension-for-pytorch</td>
<td class="tg-9wq8">intel-extension-for-pytorch</td>
</tr>
</tbody>
</table>
2. How to set different configuration for specific op_name or op_type?
> Neural Compressor extends a `set_local` method based on the global configuration object to set custom configuration.
```python
def set_local(self, operator_name_or_list: Union[List, str, Callable], config: BaseConfig) -> BaseConfig:
"""Set custom configuration based on the global configuration object.
Args:
operator_name_or_list (Union[List, str, Callable]): specific operator
config (BaseConfig): specific configuration
"""
```
> Demo:
```python
quant_config = RTNConfig() # Initialize global configuration with default bits=4
quant_config.set_local(".*mlp.*", RTNConfig(bits=8)) # For layers with "mlp" in their names, set bits=8
quant_config.set_local("Conv1d", RTNConfig(dtype="fp32")) # For Conv1d layers, do not quantize them.
```
3. How to specify an accelerator?
> Neural Compressor provides automatic accelerator detection, including HPU, XPU, CUDA, and CPU.
> The automatically detected accelerator may not be suitable for some special cases, such as poor performance or memory limitations. In such situations, users can override the detected accelerator by setting the environment variable `INC_TARGET_DEVICE`.
> Usage:
```bash
export INC_TARGET_DEVICE=cpu
```

View File

@@ -0,0 +1,123 @@
TensorFlow Quantization
===============
1. [Introduction](#introduction)
2. [Get Started](#get-started)
2.1 [Without Accuracy Aware Tuning](#without-accuracy-aware-tuning)
2.2 [With Accuracy Aware Tuning](#with-accuracy-aware-tuning)
2.3 [Specify Quantization Rules](#specify-quantization-rules)
3. [Examples](#examples)
## Introduction
`neural_compressor.tensorflow` supports quantizing both TensorFlow and Keras models, with or without accuracy-aware tuning.
For the detailed quantization fundamentals, please refer to the document for [Quantization](quantization.md).
## Get Started
### Without Accuracy Aware Tuning
This means users can leverage Intel(R) Neural Compressor to directly generate a fully quantized model without accuracy-aware tuning. It is the user's responsibility to ensure the accuracy of the quantized model meets expectations.
``` python
# main.py
# Original code
model = tf.keras.applications.resnet50.ResNet50(weights="imagenet")
val_dataset = ...
val_dataloader = MyDataloader(dataset=val_dataset)
# Quantization code
from neural_compressor.tensorflow import quantize_model, StaticQuantConfig
quant_config = StaticQuantConfig()
qmodel = quantize_model(
model=model,
quant_config=quant_config,
calib_dataloader=val_dataloader,
)
qmodel.save("./output")
```
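`MyDataloader` above is user-defined. A minimal sketch, assuming the calibration dataloader only needs a `batch_size` attribute and to iterate over `(inputs, labels)` batches from a `tf.data.Dataset`, could be:
```python
class MyDataloader:
    def __init__(self, dataset, batch_size=32):
        self.dataset = dataset.batch(batch_size)  # dataset yields (image, label) pairs
        self.batch_size = batch_size

    def __iter__(self):
        for inputs, labels in self.dataset:
            yield inputs, labels
```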
### With Accuracy Aware Tuning
This means users can leverage the advanced features of Intel(R) Neural Compressor to tune out the best quantized model, one that has the best accuracy and good performance. Users should provide `eval_fn` and `eval_args`.
``` python
# main.py
# Original code
model = tf.keras.applications.resnet50.ResNet50(weights="imagenet")
val_dataset = ...
val_dataloader = MyDataloader(dataset=val_dataset)
def eval_acc_fn(model) -> float:
...
return acc
# Quantization code
from neural_compressor.common.base_tuning import TuningConfig
from neural_compressor.tensorflow import autotune, StaticQuantConfig
# it's also supported to define custom_tune_config as:
# TuningConfig(StaticQuantConfig(weight_sym=[True, False], act_sym=[True, False]))
custom_tune_config = TuningConfig(
config_set=[
StaticQuantConfig(weight_sym=True, act_sym=True),
StaticQuantConfig(weight_sym=False, act_sym=False),
]
)
best_model = autotune(
model=model,
tune_config=custom_tune_config,
eval_fn=eval_acc_fn,
calib_dataloader=val_dataloader,
)
best_model.save("./output")
```
### Specify Quantization Rules
Intel(R) Neural Compressor supports specifying quantization rules by operator name or operator type. Users can set `local` in a dict or use the `set_local` method of the config class for this purpose.
1. Example of setting `local` from a dict
```python
quant_config = {
"static_quant": {
"global": {
"weight_dtype": "int8",
"weight_sym": True,
"weight_granularity": "per_tensor",
"act_dtype": "int8",
"act_sym": True,
"act_granularity": "per_tensor",
},
"local": {
"conv1": {
"weight_dtype": "fp32",
"act_dtype": "fp32",
}
},
}
}
config = StaticQuantConfig.from_dict(quant_config)
```
2. Example of using `set_local`
```python
quant_config = StaticQuantConfig()
conv2d_config = StaticQuantConfig(
weight_dtype="fp32",
act_dtype="fp32",
)
quant_config.set_local("conv1", conv2d_config)
```
## Examples
Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/tensorflow) on how to quantize a TensorFlow model with `neural_compressor.tensorflow`.

View File

@@ -0,0 +1,53 @@
# Smooth Quant
- [Smooth Quant](#smooth-quant)
- [Introduction](#introduction)
- [Usage](#usage)
- [Using a Fixed `alpha`](#using-a-fixed-alpha)
- [Determining the `alpha` through auto-tuning](#determining-the-alpha-through-auto-tuning)
- [Examples](#examples)
## Introduction
Quantization is a common compression technique that reduces memory and accelerates inference by converting floating-point matrices to integer matrices. For large language models (LLMs) with gigantic parameters, systematic outliers make quantization of activations difficult. [SmoothQuant](https://arxiv.org/abs/2211.10438), a training-free post-training quantization (PTQ) solution, migrates this difficulty from activations to weights offline with a mathematically equivalent transformation.
Please refer to the document of [Smooth Quant](../quantization.md/#smooth-quant) for detailed fundamental knowledge.
## Usage
There are two ways to apply smooth quantization: 1) using a fixed `alpha` for the entire model or 2) determining the `alpha` through auto-tuning.
### Using a Fixed `alpha`
To set a fixed alpha for the entire model, users can follow this example:
```python
from neural_compressor.tensorflow import SmoothQuantConfig, StaticQuantConfig, quantize_model
sq_config, static_config = SmoothQuantConfig(alpha=0.5), StaticQuantConfig()
q_model = quantize_model(output_graph_def, [sq_config, static_config], calib_dataloader)
```
The `SmoothQuantConfig` should be combined with `StaticQuantConfig` in a list because we still need to insert QDQ and apply pattern fusion after the smoothing process.
### Determining the `alpha` through auto-tuning
Users can search for the best `alpha` for the entire model. The tuning process looks for the optimal `alpha` value from a list of `alpha` values provided by the user.
Here is an example:
```python
from neural_compressor.common.base_tuning import TuningConfig
from neural_compressor.tensorflow import StaticQuantConfig, SmoothQuantConfig, autotune
custom_tune_config = TuningConfig(config_set=[SmoothQuantConfig(alpha=[0.5, 0.6, 0.7]), StaticQuantConfig()])
best_model = autotune(
model="fp32_model",
tune_config=custom_tune_config,
eval_fn=eval_fn_wrapper,
calib_dataloader=calib_dataloader,
)
```
> Please note that it may take a considerable amount of time, as the tuning process applies each `alpha` to the entire model and uses the evaluation result on the entire dataset as the metric to determine the best `alpha`.
## Examples
Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant) on how to apply smooth quant to a TensorFlow model with `neural_compressor.tensorflow`.

View File

@@ -0,0 +1,223 @@
TensorFlow
===============
- [TensorFlow](#tensorflow)
- [Introduction](#introduction)
- [API for TensorFlow](#api-for-tensorflow)
- [Support Matrix](#support-matrix)
- [Quantization Scheme](#quantization-scheme)
- [Quantization Approaches](#quantization-approaches)
- [Post Training Static Quantization](#post-training-static-quantization)
- [Smooth Quantization](#smooth-quantization)
      - [Mixed Precision](#mixed-precision)
- [Backend and Device](#backend-and-device)
## Introduction
`neural_compressor.tensorflow` provides an integrated API for applying quantization to various TensorFlow model formats, such as `pb`, `saved_model` and `keras`. The range of supported models includes, but is not limited to, CV models, NLP models, and large language models.
In terms of ease of use, Neural Compressor is committed to providing flexible and scalable user interfaces. While `quantize_model` is designed to provide a fast and straightforward quantization experience, `autotune` offers an advanced option for reducing accuracy loss during quantization.
## API for TensorFlow
Intel(R) Neural Compressor provides `quantize_model` and `autotune` as main interfaces for supported algorithms on TensorFlow framework.
**quantize_model**
The design philosophy of the `quantize_model` interface is ease of use. With minimal required parameters, including `model`, `quant_config`, `calib_dataloader` and `calib_iteration`, it offers a straightforward way to quantize a TF model in one shot.
```python
def quantize_model(
model: Union[str, tf.keras.Model, BaseModel],
quant_config: Union[BaseConfig, list],
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
```
`model` should be a string pointing to the model's location, a Keras model object, or an INC TF model wrapper object.
`quant_config` is either a `StaticQuantConfig` object or a list containing a `SmoothQuantConfig` and a `StaticQuantConfig`, indicating which algorithm should be used and which specific quantization rules should be applied.
`calib_dataloader` is used to load data samples for the calibration phase. In most cases, it can be a subset of the evaluation dataset.
`calib_iteration` decides how many iterations the calibration process will run.
`calib_func` is a substitute for `calib_dataloader` when the built-in calibration function of INC does not work for model inference.
Here is a simple example of using `quantize_model` interface with a dummy calibration dataloader and the default `StaticQuantConfig`:
```python
from neural_compressor.tensorflow import StaticQuantConfig, quantize_model
from neural_compressor.tensorflow.utils import DummyDataset
dataset = DummyDataset(shape=(100, 32, 32, 3), label=True)
calib_dataloader = MyDataLoader(dataset=dataset)
quant_config = StaticQuantConfig()
qmodel = quantize_model("fp32_model.pb", quant_config, calib_dataloader)
```
**autotune**
The `autotune` interface, on the other hand, provides greater flexibility and power. It's particularly useful when accuracy is a critical factor. If the initial quantization doesn't meet the tolerance of accuracy loss, `autotune` will iteratively try quantization rules according to the `tune_config`.
Just like `quantize_model`, `autotune` requires `model`, `calib_dataloader` and `calib_iteration`. In addition, `eval_fn` and `eval_args` are used to build the evaluation process.
```python
def autotune(
model: Union[str, tf.keras.Model, BaseModel],
tune_config: TuningConfig,
eval_fn: Callable,
eval_args: Optional[Tuple[Any]] = None,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
) -> Optional[BaseModel]:
```
`model` should be a string pointing to the model's location, a Keras model object, or an INC TF model wrapper object.
`tune_config` is the `TuningConfig` object which contains multiple quantization rules.
`eval_fn` is the evaluation function that measures the accuracy of a model.
`eval_args` holds the supplemental arguments required by the defined evaluation function.
`calib_dataloader` is used to load data samples for the calibration phase. In most cases, it can be a subset of the evaluation dataset.
`calib_iteration` decides how many iterations the calibration process will run.
`calib_func` is a substitute for `calib_dataloader` when the built-in calibration function of INC does not work for model inference.
Here is a simple example of using `autotune` interface with different quantization rules defined by a list of `StaticQuantConfig`:
```python
from neural_compressor.common.base_tuning import TuningConfig
from neural_compressor.tensorflow import StaticQuantConfig, autotune
calib_dataloader = MyDataloader(dataset=Dataset())
custom_tune_config = TuningConfig(
config_set=[
StaticQuantConfig(weight_sym=True, act_sym=True),
StaticQuantConfig(weight_sym=False, act_sym=False),
]
)
best_model = autotune(
model="baseline_model",
tune_config=custom_tune_config,
eval_fn=eval_acc_fn,
calib_dataloader=calib_dataloader,
)
```
### Support Matrix
#### Quantization Scheme
| Framework | Backend Library | Symmetric Quantization | Asymmetric Quantization |
| :-------------- |:---------------:| ---------------:|---------------:|
| TensorFlow | [oneDNN](https://github.com/oneapi-src/oneDNN) | Activation (int8/uint8), Weight (int8) | - |
| Keras | [ITEX](https://github.com/intel/intel-extension-for-tensorflow) | Activation (int8/uint8), Weight (int8) | - |
+ Symmetric Quantization (illustrated by the standalone sketch after this list)
    + int8: `scale = 2 * max(abs(rmin), abs(rmax)) / (max(int8) - min(int8) - 1)`
    + uint8: `scale = max(rmin, rmax) / (max(uint8) - min(uint8))`
+ oneDNN: [Lower Numerical Precision Deep Learning Inference and Training](https://software.intel.com/content/www/us/en/develop/articles/lower-numerical-precision-deep-learning-inference-and-training.html)
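To make the scale formulas above concrete, here is a small standalone sketch (NumPy only, not an INC API) that computes both symmetric scales for a random tensor and shows a quantize/dequantize round trip with the int8 scale:
```python
import numpy as np


def symmetric_int8_scale(tensor):
    # scale = 2 * max(|rmin|, |rmax|) / (max(int8) - min(int8) - 1)
    rmin, rmax = float(tensor.min()), float(tensor.max())
    return 2 * max(abs(rmin), abs(rmax)) / (127 - (-128) - 1)


def symmetric_uint8_scale(tensor):
    # scale = max(rmin, rmax) / (max(uint8) - min(uint8))
    rmin, rmax = float(tensor.min()), float(tensor.max())
    return max(rmin, rmax) / (255 - 0)


x = np.random.uniform(-2.0, 3.0, size=(64, 64)).astype(np.float32)
int8_scale = symmetric_int8_scale(x)
print("int8 scale:", int8_scale, "uint8 scale:", symmetric_uint8_scale(x))

# Quantize/dequantize round trip with the symmetric int8 scale
q = np.clip(np.round(x / int8_scale), -127, 127).astype(np.int8)
x_dq = q.astype(np.float32) * int8_scale
print("max abs reconstruction error:", np.abs(x - x_dq).max())
```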
#### Quantization Approaches
The supported Quantization methods for TensorFlow and Keras are listed below:
<table class="center">
<thead>
<tr>
<th>Types</th>
<th>Quantization</th>
<th>Dataset Requirements</th>
<th>Framework</th>
<th>Backend</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="2" align="center">Post-Training Static Quantization (PTQ)</td>
<td rowspan="2" align="center">weights and activations</td>
<td rowspan="2" align="center">calibration</td>
<td align="center">Keras</td>
<td align="center"><a href="https://github.com/intel/intel-extension-for-tensorflow">ITEX</a></td>
</tr>
<tr>
<td align="center">TensorFlow</td>
<td align="center"><a href="https://github.com/tensorflow/tensorflow">TensorFlow</a>/<a href="https://github.com/Intel-tensorflow/tensorflow">Intel TensorFlow</a></td>
</tr>
<tr>
<td rowspan="1" align="center">Smooth Quantization(SQ)</td>
<td rowspan="1" align="center">weights</td>
<td rowspan="1" align="center">calibration</td>
<td align="center">Tensorflow</td>
<td align="center"><a href="https://github.com/tensorflow/tensorflow">TensorFlow</a>/<a href="https://github.com/Intel-tensorflow/tensorflow">Intel TensorFlow</a></td>
</tr>
<tr>
<td rowspan="1" align="center">Mixed Precision(MP)</td>
<td rowspan="1" align="center">weights and activations</td>
<td rowspan="1" align="center">NA</td>
<td align="center">Tensorflow</td>
<td align="center"><a href="https://github.com/tensorflow/tensorflow">TensorFlow</a>/<a href="https://github.com/Intel-tensorflow/tensorflow">Intel TensorFlow</a></td>
</tr>
</tbody>
</table>
<br>
<br>
##### Post Training Static Quantization
The min/max ranges of weights and activations are collected offline on a so-called `calibration` dataset. This dataset should represent the data distribution of the unseen inference data. The `calibration` process runs on the original fp32 model and dumps out the tensor distributions needed for `Scale` and `ZeroPoint` calculations. Usually, about 100 samples are enough for calibration.
Refer to the [PTQ Guide](./TF_Quant.md) for detailed information.
##### Smooth Quantization
Smooth Quantization (SQ) is an advanced quantization technique designed to optimize model performance while maintaining high accuracy. Unlike traditional quantization methods that can lead to significant accuracy loss, SQ takes a more refined approach by striking a balance between the scales of activations and weights.
Refer to the [SQ Guide](./TF_SQ.md) for detailed information.
##### Mixed Precision
Mixed Precision (MP) is enabled together with Post Training Static Quantization. Once `BF16` is supported on the machine, the matched operators will be converted automatically.
#### Backend and Device
Intel(R) Neural Compressor supports TF GPU through [ITEX-XPU](https://github.com/intel/intel-extension-for-tensorflow). Models are automatically run on GPU when ITEX-XPU is detected as installed.
<table class="center">
<thead>
<tr>
<th>Framework</th>
<th>Backend</th>
<th>Backend Library</th>
<th>Backend Value</th>
<th>Support Device(cpu as default)</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="2" align="left">TensorFlow</td>
<td align="left">TensorFlow</td>
<td align="left">OneDNN</td>
<td align="left">"default"</td>
<td align="left">cpu</td>
</tr>
<tr>
<td align="left">ITEX</td>
<td align="left">OneDNN</td>
<td align="left">"itex"</td>
<td align="left">cpu | gpu</td>
</tr>
</tbody>
</table>
<br>
<br>

View File

@@ -0,0 +1,90 @@
AutoTune
========================================
1. [Overview](#overview)
2. [How it Works](#how-it-works)
3. [Working with Autotune](#working-with-autotune) \
3.1 [Working with PyTorch Model](#working-with-pytorch-model) \
   3.2 [Working with TensorFlow Model](#working-with-tensorflow-model)
## Overview
Intel® Neural Compressor aims to help users quickly deploy low-precision models by leveraging popular compression techniques, such as post-training quantization and weight-only quantization algorithms. Despite having a variety of these algorithms, finding the appropriate configuration for a model can be difficult and time-consuming. To address this, we built the `autotune` module based on the [strategy](./tuning_strategies.md) in 2.x for accuracy-aware tuning, which identifies the best algorithm configuration for a model to achieve optimal performance under given accuracy criteria. This module allows users to easily apply predefined tuning recipes and customize the tuning space as needed.
## How it Works
The autotune module constructs the tuning space according to the pre-defined tuning set or the user's tuning set. It iterates over the tuning space, applies each configuration to the given float model, then records the evaluation result and compares it with the baseline. The tuning process stops when the exit policy is met.
## Working with Autotune
The `autotune` API is used across all frameworks supported by INC. It accepts three primary arguments: `model`, `tune_config`, and `eval_fn`.
The `TuningConfig` class defines the tuning process, including the tuning space, order, and exit policy.
- Define the tuning space
Users can define the tuning space by setting `config_set` with an algorithm configuration or a set of configurations.
```python
from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, get_woq_tuning_config

# Use the default tuning space
config_set = get_woq_tuning_config()

# Customize the tuning space with one algorithm configuration
config_set = RTNConfig(use_sym=False, group_size=[32, 64])

# Customize the tuning space with two algorithm configurations
config_set = [RTNConfig(use_sym=False, group_size=32), GPTQConfig(group_size=128, use_sym=False)]
```
```
- Define the tuning order
The tuning order determines how the process traverses the tuning space and samples configurations. Users can customize it by configuring the `sampler`. Currently, we provide the `default_sampler`, which samples configurations sequentially, always in the same order.
- Define the exit policy
The exit policy includes two components: accuracy goal (`tolerable_loss`) and the allowed number of trials (`max_trials`). The tuning process will stop when either condition is met.
### Working with PyTorch Model
The example below demonstrates how to autotune a PyTorch model on four `RTNConfig` configurations.
```python
from neural_compressor.torch.quantization import RTNConfig, TuningConfig, autotune
def eval_fn(model) -> float:
return ...
tune_config = TuningConfig(
config_set=RTNConfig(use_sym=[False, True], group_size=[32, 128]),
tolerable_loss=0.2,
max_trials=10,
)
q_model = autotune(model, tune_config=tune_config, eval_fn=eval_fn)
```
### Working with TensorFlow Model
The example below demonstrates how to autotune a TensorFlow model on two `StaticQuantConfig` configurations.
```python
from neural_compressor.common.base_tuning import TuningConfig
from neural_compressor.tensorflow.quantization import StaticQuantConfig, autotune
calib_dataloader = MyDataloader(...)
custom_tune_config = TuningConfig(
config_set=[
StaticQuantConfig(weight_sym=True, act_sym=True),
StaticQuantConfig(weight_sym=False, act_sym=False),
]
)
def eval_fn(model) -> float:
return ...
best_model = autotune(
model="baseline_model", tune_config=custom_tune_config, eval_fn=eval_fn, calib_dataloader=calib_dataloader
)
```

View File

@@ -0,0 +1,61 @@
Benchmark
---
1. [Introduction](#introduction)
2. [Supported Matrix](#supported-matrix)
3. [Usage](#usage)
## Introduction
Intel Neural Compressor provides a command `incbench` to launch the Intel CPU performance benchmark.
To get peak performance on an Intel Xeon CPU, avoid crossing NUMA nodes within a single instance.
Therefore, by default, `incbench` will trigger 1 instance on the first NUMA node.
## Supported Matrix
| Platform | Status |
|:---:|:---:|
| Linux | &#10004; |
| Windows | &#10004; |
## Usage
| Parameters | Default | Comments |
|:----------------------:|:------------------------:|:--------------------------------------------:|
| num_instances | 1 | Number of instances |
| num_cores_per_instance | None | Number of cores in each instance |
| C, cores | 0-${num_cores_on_NUMA-1} | Decides the visible core range |
| cross_memory | False | Whether to allocate memory across NUMA nodes |
> Note: cross_memory is set to True only when memory is insufficient.
### General Use Cases
1. `incbench main.py`: run 1 instance on NUMA:0.
2. `incbench --num_i 2 main.py`: run 2 instances on NUMA:0.
3. `incbench --num_c 2 main.py`: run multi-instances with 2 cores per instance on NUMA:0.
4. `incbench -C 24-47 main.py`: run 1 instance on COREs:24-47.
5. `incbench -C 24-47 --num_c 4 main.py`: run multi-instances with 4 COREs per instance on COREs:24-47.
> Note:
> - `num_i` works the same as `num_instances`
> - `num_c` works the same as `num_cores_per_instance`
### Dump Throughput and Latency Summary
To merge benchmark results from multiple instances, `incbench` automatically checks log files for "throughput" and "latency" messages matching the following patterns.
```python
throughput_pattern = r"[T,t]hroughput:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"
latency_pattern = r"[L,l]atency:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"
```
#### Demo usage
```python
print("Throughput: {:.3f} samples/sec".format(throughput))
print("Latency: {:.3f} ms".format(latency * 10**3))
```

View File

@@ -0,0 +1,40 @@
Quantization on Client
==========================================
1. [Introduction](#introduction)
2. [Get Started](#get-started)
## Introduction
For the `RTN` and `GPTQ` algorithms, we provide default algorithm configurations for different processor types (`client` and `server`). Generally, lightweight configurations are tailored specifically for client devices to enhance performance and efficiency.
## Get Started
Here, we take the `RTN` algorithm as an example to demonstrate the usage on a client machine.
```python
from neural_compressor.torch.quantization import get_default_rtn_config, convert, prepare
from neural_compressor.torch import load_empty_model
model_state_dict_path = "/path/to/model/state/dict"
float_model = load_empty_model(model_state_dict_path)
quant_config = get_default_rtn_config()
prepared_model = prepare(float_model, quant_config)
quantized_model = convert(prepared_model)
```
> [!TIP]
> By default, the appropriate configuration is determined based on hardware information, but users can explicitly specify `processor_type` as either `client` or `server` when calling `get_default_rtn_config`.
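For example, to request the lightweight client profile explicitly rather than relying on auto-detection (a minimal sketch based on the tip above):
```python
from neural_compressor.torch.quantization import get_default_rtn_config

# Force the client configuration, then proceed with prepare()/convert() exactly as shown above
quant_config = get_default_rtn_config(processor_type="client")
```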
For Windows machines, run the following command to utilize all available cores automatically:
```bash
python main.py
```
> [!TIP]
> For Linux systems, users need to configure the environment variables appropriately to achieve optimal performance. For example, set the `OMP_NUM_THREADS` explicitly. For processors with hybrid architecture (including both P-cores and E-cores), it is recommended to bind tasks to all P-cores using `taskset`.
RTN quantization is a quick process, finishing in tens of seconds and using several GB of RAM when working with 7B models, e.g., [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). However, for higher accuracy, the GPTQ algorithm is recommended; be prepared for a longer quantization time.
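As a rough sketch of the GPTQ path, mirroring the `prepare`/`convert` flow above: the toy model and calibration batches below are placeholders (real use targets LLM checkpoints such as the one linked above), and the exact GPTQ options may differ, so treat this as an assumption-laden illustration rather than a reference implementation.
```python
import torch
from neural_compressor.torch.quantization import get_default_gptq_config, prepare, convert

# Toy float model and calibration batches, only to make the flow concrete
float_model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))
calib_batches = [torch.randn(4, 64) for _ in range(8)]

quant_config = get_default_gptq_config()
prepared_model = prepare(float_model, quant_config)


def run_calibration(model):
    """GPTQ needs representative data: run a few batches through the prepared model."""
    with torch.no_grad():
        for batch in calib_batches:
            model(batch)


run_calibration(prepared_model)
quantized_model = convert(prepared_model)
```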

View File

@@ -0,0 +1,16 @@
Design
=====
## Architecture
<a target="_blank" href="imgs/architecture.png">
<img src="imgs/architecture.png" alt="Architecture">
</a>
## Workflows
Intel® Neural Compressor provides two workflows: Quantization and Auto-tune.
<a target="_blank" href="imgs/workflow.png">
<img src="imgs/workflow.png" alt="Workflow">
</a>

View File

@@ -0,0 +1,28 @@
### Version mapping between Intel Neural Compressor and Gaudi Software Stack ###
<table>
<thead>
<tr>
<th>Intel Neural Compressor</th>
<th>Gaudi Software Stack</th>
</tr>
</thead>
<tbody>
<tr>
<td>v3.0</td>
<td>v1.17</td>
</tr>
</tbody>
<tbody>
<tr>
<td>v3.1</td>
<td>v1.18</td>
</tr>
</tbody>
<tbody>
<tr>
<td>v3.2</td>
<td>v1.19</td>
</tr>
</tbody>
</table>

Binary file not shown.

Some files were not shown because too many files have changed in this diff