This commit is contained in:
zhousha
2025-08-06 15:38:55 +08:00
parent 4916ad0fe0
commit 55a67e817e
193 changed files with 51647 additions and 1 deletions

BIN
.DS_Store vendored

Binary file not shown.

49
Dockerfile Normal file
View File

@@ -0,0 +1,49 @@
FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0

# MAINTAINER is deprecated (hadolint DL4000); use an OCI label instead.
LABEL org.opencontainers.image.authors="shiguangchuan@4paradigm.com"

WORKDIR /workspace

# Custom ssh-keygen binary shipped in the build context.
COPY ssh-keygen /bin

# Name the wheel once so the download/install/cleanup steps cannot drift apart.
ARG PYNINI_WHEEL=pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl

# Install pynini from the internal FTP mirror; download, install, and delete
# in one layer so the wheel file never persists in the image.
RUN wget -q "ftp://ftp.4pd.io/pub/pico/temp/${PYNINI_WHEEL}" \
    && pip install --no-cache-dir "${PYNINI_WHEEL}" \
    && rm -f "${PYNINI_WHEEL}"

# Copy only the dependency manifest first so the pip layer stays cached until
# requirements.txt changes (COPY preferred over ADD for plain files, DL3020).
COPY ./requirements.txt /workspace

# --no-cache-dir replaces the separate `pip cache purge` step.
# NOTE(review): the generated ECDSA private key is baked into an image layer;
# anyone with pull access to the image can read it — confirm this is intended.
RUN pip install --no-cache-dir -r ./requirements.txt \
        -i https://nexus.4pd.io/repository/pypi-all/simple \
        --trusted-host nexus.4pd.io \
        --extra-index-url https://mirrors.aliyun.com/pypi/simple/ \
    && ssh-keygen -f /workspace/ssh-key-ecdsa -t ecdsa -b 521 -q -N ""

# Application source last: it changes most often, keeping earlier layers cached.
COPY . /workspace

EXPOSE 80
CMD ["python3", "run_callback.py"]
###########################
## Dockerfile更新后
#FROM harbor.4pd.io/lab-platform/inf/python:3.9
#WORKDIR /app
## 安装依赖
##RUN pip install torch librosa flask
##RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
## pip cache purge && \
## pip --default-timeout=1000 install torch librosa flask
## 删除原来的 COPY pytorch_model.bin /app/
#COPY inference.py /app/
# 只需要复制启动脚本
#EXPOSE 80
#CMD ["python", "inference.py"]
####################
##############################更新0731#################################

Submodule asr-tco_image deleted from 8f9a14f472

6
config.yaml Normal file
View File

@@ -0,0 +1,6 @@
leaderboard_options:
  nfs:
    - name: sid_model
      srcRelativePath: zhoushasha/models/image_models/apple_mobilevit-small
      mountPoint: /model
      source: ceph_customer

BIN
helm-chart/.DS_Store vendored Normal file

Binary file not shown.

77
helm-chart/README.md Normal file
View File

@@ -0,0 +1,77 @@
## judgeflow chart 的要求
### values.yaml 文件必须包含如下字段,并且模板中必须引用 values.yaml 中的如下字段
```
podLabels
env
volumeMounts
volumes
affinity
```
### values.yaml 文件必须在 volumeMounts 中声明如下卷
```
workspace
submit
datafile
```
## 被测服务sut chart 的要求
### values.yaml 文件必须包含如下字段,并且资源模板中必须引用 values.yaml 中的如下字段
```
podLabels
affinity
```
针对 podLabels 字段,values.yaml 中配置格式如下:
```
podLabels: {}
```
下面给出示例
podLabels
values.yaml
templates/deployment.yaml
```
metadata:
labels:
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
```
affinity
values.yaml
```
affinity: {}
```
templates/deployment.yaml
```
spec:
template:
spec:
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
```
### 如果需要在 sut 中使用共享存储,则 sut chart 的 values.yaml 也必须包含如下字段,且模板中必须引用 values.yaml 中的如下字段
```
volumeMounts
volumes
```

View File

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View File

@@ -0,0 +1,24 @@
apiVersion: v2
name: ${chartName}
description: Leaderboard judgeflow helm chart for demo
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: ${version}
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "${appVersion}"

View File

@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "judgeflow.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "judgeflow.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "judgeflow.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "judgeflow.labels" -}}
helm.sh/chart: {{ include "judgeflow.chart" . }}
{{ include "judgeflow.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "judgeflow.selectorLabels" -}}
app.kubernetes.io/name: {{ include "judgeflow.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "judgeflow.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "judgeflow.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "judgeflow.fullname" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "judgeflow.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,61 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "judgeflow.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,63 @@
# One-shot evaluation Job: runs the judge container to completion exactly once
# (restartPolicy: Never + backoffLimit: 0 → no retries on failure).
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "judgeflow.fullname" . }}
  labels:
    {{- include "judgeflow.labels" . | nindent 4 }}
    {{- with .Values.podLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  template:
    metadata:
      labels:
        {{- include "judgeflow.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.priorityclassname }}
      priorityClassName: "{{ . }}"
      {{- end }}
      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          {{- with .Values.env }}
          env:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          # Ports are optional for a Job; rendered only when values declare them.
          {{- if and (hasKey .Values "service") (hasKey .Values.service "ports") }}
          ports:
            {{- range .Values.service.ports }}
            - name: {{ .name }}
              containerPort: {{ .port }}
            {{- end }}
          {{- end }}
          # Optional command override, rendered verbatim — values must supply
          # it in a YAML-compatible form (e.g. a flow-style list).
          {{- if hasKey .Values "command" }}
          command: {{ .Values.command }}
          {{- end }}
          # volumeMounts/resources are mandatory values for this chart (the
          # chart README requires workspace/submit/datafile mounts).
          volumeMounts:
            {{- toYaml .Values.volumeMounts | nindent 12 }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
      restartPolicy: Never
      {{- with .Values.volumes }}
      volumes:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
  # Never retry a failed pod — a judge run must not execute twice.
  backoffLimit: 0

View File

@@ -0,0 +1,10 @@
{{/*
Optional PriorityClass, created only when .Values.priorityclassname is set.
It is non-default (globalDefault: false) and non-preempting.
*/}}
{{- if .Values.priorityclassname }}
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: "{{ .Values.priorityclassname }}"
# NOTE(review): values.yaml supplies priorityclassvalue as the string '0'; it
# renders unquoted here and PriorityClass requires an integer — confirm.
value: {{ .Values.priorityclassvalue }}
globalDefault: false
preemptionPolicy: "Never"
description: "This is a priority class."
{{- end }}

View File

@@ -0,0 +1,22 @@
{{/*
Service for the judge pod; rendered only when values define service.type,
since the judge Job may also run without exposing any ports.
*/}}
{{- if and (hasKey .Values "service") (hasKey .Values.service "type") }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "judgeflow.fullname" . }}
  labels:
    {{- include "judgeflow.labels" . | nindent 4 }}
    {{- with .Values.podLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  type: {{ .Values.service.type }}
  # One Service port per entry in .Values.service.ports; targetPort reuses the
  # same numeric port as the container.
  ports:
    {{- range .Values.service.ports }}
    - port: {{ .port }}
      targetPort: {{ .port }}
      protocol: TCP
      name: {{ .name }}
    {{- end }}
  selector:
    {{- include "judgeflow.selectorLabels" . | nindent 4 }}
{{- end }}

View File

@@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "judgeflow.serviceAccountName" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}

View File

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: {{ include "judgeflow.fullname" . }}-test-connection
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "judgeflow.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never

View File

@@ -0,0 +1,124 @@
# Default values for job_demo.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: "${imageRepo}"
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: "${imageTag}"
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels:
contest.4pd.io/leaderboard-resource-type: judge_flow
contest.4pd.io/leaderboard-job-id: "0"
contest.4pd.io/leaderboard-submit-id: "0"
podSecurityContext: {}
# fsGroup: 2000
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
ports:
- name: http
port: 80
ingress:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
resources:
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
limits:
cpu: 3000m
memory: 16Gi
requests:
cpu: 3000m
memory: 16Gi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
nodeSelector:
juicefs: "on"
contest.4pd.io/cpu: INTEL-8358
tolerations: []
affinity: {}
env:
- name: TZ
value: Asia/Shanghai
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
#command: '["python","run.py"]'
volumeMounts:
- name: workspace
mountPath: /tmp/workspace
- name: datafile
mountPath: /tmp/datafile
- name: submit
mountPath: /tmp/submit_config
- name: juicefs-pv
mountPath: /tmp/juicefs
- name: customer
mountPath: /tmp/customer
- name: submit-private
mountPath: /tmp/submit_private
volumes:
- name: juicefs-pv
persistentVolumeClaim:
claimName: juicefs-pvc
priorityclassname: ''
priorityclassvalue: '0'

BIN
helm-chart/sut/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

24
helm-chart/sut/Chart.yaml Normal file
View File

@@ -0,0 +1,24 @@
apiVersion: v2
name: sut
description: A Helm chart for Kubernetes
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.1.0"

View File

@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "sut.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "sut.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "sut.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "sut.labels" -}}
helm.sh/chart: {{ include "sut.chart" . }}
{{ include "sut.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "sut.selectorLabels" -}}
app.kubernetes.io/name: {{ include "sut.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "sut.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "sut.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,94 @@
# Deployment for the system-under-test (SUT) service container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "sut.fullname" . }}
  labels:
    {{- include "sut.labels" . | nindent 4 }}
    {{- with .Values.podLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  # Replica count is owned by the HPA when autoscaling is enabled.
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "sut.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "sut.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "sut.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      {{- with .Values.priorityclassname }}
      priorityClassName: "{{ . }}"
      {{- end }}
      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          {{- with .Values.env }}
          env:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          # Single HTTP port, wired to .Values.service.port.
          ports:
            - name: http
              containerPort: {{ .Values.service.port }}
              protocol: TCP
          {{- with .Values.command }}
          command:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          {{- with .Values.volumeMounts }}
          volumeMounts:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.livenessProbe }}
          livenessProbe:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.readinessProbe }}
          readinessProbe:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.startupProbe }}
          startupProbe:
            {{- toYaml . | nindent 12 }}
          {{- end }}
      volumes:
        {{- with .Values.volumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      # Hardcoded toleration for Iluvatar GPU nodes.
      # NOTE(review): .Values.tolerations is defined in values.yaml but is NOT
      # referenced here — this list always wins. Confirm that is intentional.
      tolerations:
        - key: "hosttype"
          operator: "Equal"
          value: "iluvatar"
          effect: "NoSchedule"

View File

@@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "sut.fullname" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "sut.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,61 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "sut.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "sut.fullname" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: socket
selector:
{{- include "sut.selectorLabels" . | nindent 4 }}

View File

@@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "sut.serviceAccountName" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}

View File

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "sut.fullname" . }}-test-connection"
labels:
{{- include "sut.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "sut.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never

View File

@@ -0,0 +1,144 @@
# Default values for sut.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: harbor.4pd.io/lab-platform/inf/python
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: 3.9
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Automatically mount a ServiceAccount's API credentials?
automount: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels: {}
podSecurityContext: {}
# fsGroup: 2000
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
port: 80
ingress:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
resources:
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
limits:
cpu: 1000m
memory: 4096Mi
requests:
cpu: 1000m
memory: 4096Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
# secret:
# secretName: mysecret
# optional: false
# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
# mountPath: "/etc/foo"
# readOnly: true
nodeSelector:
contest.4pd.io/accelerator: iluvatar-BI-V100
tolerations:
- key: hosttype
operator: Equal
value: iluvatar
effect: NoSchedule
affinity: {}
readinessProbe:
failureThreshold: 1000
httpGet:
path: /health
port: 80
scheme: HTTP
#readinessProbe:
# httpGet:
# path: /health
# port: 80
# scheme: HTTP
# initialDelaySeconds: 5 # 应用启动后等待 5 秒再开始探测
# failureThreshold: 5 # 连续失败 3 次后标记为未就绪
# successThreshold: 1 # 连续成功 1 次后标记为就绪
env:
- name: TZ
value: Asia/Shanghai
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: MY_NODE_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
#command: ''
priorityclassname: ''

64
local_test.py Normal file
View File

@@ -0,0 +1,64 @@
import os
import tempfile
import shutil

# Local smoke test: fabricates a submit config and a throwaway SSH key pair in
# a temp dir, then drives run_async_a10.get_sut_url_windows() against them.

# Remove state left over from a previous run so the test starts clean.
if os.path.exists("/tmp/submit_private"):
    shutil.rmtree("/tmp/submit_private")

with tempfile.TemporaryDirectory() as tempdir:
    config_path = os.path.join(tempdir, "config.json")
    # Generate a disposable ECDSA key pair; os.system returns 0 on success.
    assert not os.system(f"ssh-keygen -f {tempdir}/ssh-key-ecdsa -t ecdsa -b 521 -q -N \"\"")
    # NOTE(review): the scrape this was recovered from appears to have stripped
    # indentation inside this YAML literal (e.g. nesting under `config.json:`
    # and `leaderboard_options:`) — restore it before relying on this test.
    config = """
model: whisper
model_key: whisper
config.json:
name: 'faster-whisper-server:latest'
support_devices:
- cpu
model_path: ''
port: 8080
other_ports: []
other_ports_count: 1
entrypoint: start.bat
MIN_CHUNK: 2.5
MIN_ADD_CHUNK: 2.5
COMPUTE_TYPE: int8
NUM_WORKERS: 1
CPU_THREADS: 2
BEAM_SIZE: 5
BATCH: 1
LANG: auto
DEVICE: cpu
CHUNK_LENGTH: 5
CLASS_MODEL: ./models/faster-whisper-base
EN_MODEL: ./models/faster-whisper-base
ZH_MODEL: ./models/faster-whisper-base
RU_MODEL: ./models/faster-whisper-base
PT_MODEL: ./models/faster-whisper-base
AR_MODEL: ./models/faster-whisper-base
NEW_VERSION: 1
NEED_RESET: 0
leaderboard_options:
nfs:
- name: whisper
srcRelativePath: leaderboard/pc_asr/en.tar.gz
mountPoint: /tmp
source: ceph_customer
"""
    with open(config_path, "w") as f:
        f.write(config)

    # Environment contract expected by run_async_a10 (key dir, submit config,
    # and the model-name → archive mapping).
    os.environ["SSH_KEY_DIR"] = tempdir
    os.environ["SUBMIT_CONFIG_FILEPATH"] = config_path
    os.environ["MODEL_MAPPING"] = '{"whisper": "edge-ml.tar.gz"}'

    # Import after the environment is prepared — the module reads it on import.
    from run_async_a10 import get_sut_url_windows
    print(get_sut_url_windows())

    # Keep the temp dir (and any spawned SUT) alive for manual inspection.
    import time
    time.sleep(3600)

8
mock_env.sh Normal file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Mock the environment variables the benchmark platform injects, so the judge
# can be exercised locally without a cluster.
export DATASET_FILEPATH=dataset/formatted1/de.zip        # input dataset archive
export RESULT_FILEPATH=out/result.json                   # aggregate result output
export DETAILED_CASES_FILEPATH=out/detail_cases.json     # per-case result output
export SUBMIT_CONFIG_FILEPATH=                           # intentionally empty locally
export BENCHMARK_NAME=                                   # intentionally empty locally
export MY_POD_IP=127.0.0.1                               # stand-in for the pod IP

215
model_test_caltech_3.py Normal file
View File

@@ -0,0 +1,215 @@
import requests
import json
import torch
from PIL import Image
from io import BytesIO
from transformers import BeitImageProcessor, BeitForImageClassification
# 根据模型实际架构选择类
from transformers import ViTForImageClassification, BeitForImageClassification
from tqdm import tqdm
from transformers import AutoConfig
from transformers import AutoImageProcessor, AutoModelForImageClassification
import os
import random
import time # 新增导入时间模块
# Use GPU acceleration when available (incl. Iluvatar devices exposing the
# CUDA API); otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用的设备: {device}")  # debug: show the selected device
# With multiple visible GPUs, DataParallel is applied inside
# COCOImageClassifier below; here we only report the count.
if torch.cuda.device_count() > 1:
    print(f"使用 {torch.cuda.device_count()} 块 GPU 进行计算")
class COCOImageClassifier:
def __init__(self, model_path: str, local_image_paths: list):
"""初始化COCO图像分类器"""
self.processor = AutoImageProcessor.from_pretrained(model_path)
self.model = AutoModelForImageClassification.from_pretrained(model_path)
# 将模型移动到设备
self.model = self.model.to(device)
print(f"模型是否在 GPU 上: {next(self.model.parameters()).is_cuda}") # 添加调试信息
# 若有多块 GPU使用 DataParallel
if torch.cuda.device_count() > 1:
self.model = torch.nn.DataParallel(self.model)
self.id2label = self.model.module.config.id2label if hasattr(self.model, 'module') else self.model.config.id2label
self.local_image_paths = local_image_paths
def predict_image_path(self, image_path: str, top_k: int = 5) -> dict:
"""
预测本地图片文件对应的图片类别
Args:
image_path: 本地图片文件路径
top_k: 返回置信度最高的前k个类别
Returns:
包含预测结果的字典
"""
try:
# 打开图片
image = Image.open(image_path).convert("RGB")
# 预处理
inputs = self.processor(images=image, return_tensors="pt")
# 将输入数据移动到设备
inputs = inputs.to(device)
# 模型推理
with torch.no_grad():
outputs = self.model(**inputs)
# 获取预测结果
logits = outputs.logits
probs = torch.nn.functional.softmax(logits, dim=1)
top_probs, top_indices = probs.topk(top_k, dim=1)
# 整理结果
predictions = []
for i in range(top_k):
class_idx = top_indices[0, i].item()
confidence = top_probs[0, i].item()
predictions.append({
"class_id": class_idx,
"class_name": self.id2label[class_idx],
"confidence": confidence
})
return {
"image_path": image_path,
"predictions": predictions
}
except Exception as e:
print(f"处理图片文件 {image_path} 时出错: {e}")
return None
def batch_predict(self, limit: int = 20, top_k: int = 5) -> list:
"""
批量预测本地图片
Args:
limit: 限制处理的图片数量
top_k: 返回置信度最高的前k个类别
Returns:
包含所有预测结果的列表
"""
results = []
local_image_paths = self.local_image_paths[:limit]
print(f"开始预测 {len(local_image_paths)} 张本地图片...")
start_time = time.time() # 记录开始时间
for image_path in tqdm(local_image_paths):
result = self.predict_image_path(image_path, top_k)
if result:
results.append(result)
end_time = time.time() # 记录结束时间
total_time = end_time - start_time # 计算总时间
images_per_second = len(results) / total_time # 计算每秒处理的图片数量
print(f"模型每秒可以处理 {images_per_second:.2f} 张图片")
return results
def save_results(self, results: list, output_file: str = "caltech_predictions.json"):
    """Write the prediction list to ``output_file`` as pretty-printed UTF-8 JSON."""
    with open(output_file, 'w', encoding='utf-8') as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)
    print(f"结果已保存到 {output_file}")
# Main program: sample Caltech-256 images, predict, and report accuracy/recall.
if __name__ == "__main__":
    # Local model / dataset locations (edit to match your environment).
    LOCAL_MODEL_PATH = "/home/zhoushasha/models/microsoft_beit_base_patch16_224_pt22k_ft22k"
    CALTECH_256_PATH = "/home/zhoushasha/models/256ObjectCategoriesNew"
    local_image_paths = []
    true_labels = {}
    # Walk every Caltech-256 category folder; folders are named "NNN.class_name".
    for folder in os.listdir(CALTECH_256_PATH):
        folder_path = os.path.join(CALTECH_256_PATH, folder)
        if os.path.isdir(folder_path):
            # Robust split: fall back to the raw folder name when no '.' is
            # present (original code raised IndexError on such folders).
            parts = folder.split('.', 1)
            class_name = parts[1] if len(parts) > 1 else parts[0]
            image_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
            # Sample at most 3 images per category.
            selected_images = random.sample(image_files, min(3, len(image_files)))
            for image_path in selected_images:
                local_image_paths.append(image_path)
                true_labels[image_path] = class_name
    classifier = COCOImageClassifier(LOCAL_MODEL_PATH, local_image_paths)
    results = classifier.batch_predict(limit=len(local_image_paths), top_k=3)
    classifier.save_results(results)
    print(f"\n处理完成: 成功预测 {len(results)} 张图片")
    if results:
        print("\n示例预测结果:")
        sample = results[0]
        print(f"图片路径: {sample['image_path']}")
        for i, pred in enumerate(sample['predictions'], 1):
            print(f"{i}. {pred['class_name']} (置信度: {pred['confidence']:.2%})")
    correct_count = 0
    total_count = len(results)
    # Per-class sample counts and correct-prediction counts.
    class_actual_count = {}
    class_correct_count = {}
    for prediction in results:
        image_path = prediction['image_path']
        top1_prediction = max(prediction['predictions'], key=lambda x: x['confidence'])
        predicted_class = top1_prediction['class_name'].lower()
        # Fix: default to "" so a missing label cannot crash with AttributeError.
        true_class = true_labels.get(image_path, "").lower()
        class_actual_count[true_class] = class_actual_count.get(true_class, 0) + 1
        # A prediction is correct when any word of the predicted label contains
        # the true class name. Fix: require a non-empty label, since "" is a
        # substring of every word and previously always matched.
        if true_class:
            for word in predicted_class.split():
                if true_class in word:
                    correct_count += 1
                    class_correct_count[true_class] = class_correct_count.get(true_class, 0) + 1
                    break
    # Fix: guard all ratios against empty denominators.
    accuracy = correct_count / total_count if total_count else 0.0
    print(f"\nAccuracy: {accuracy * 100:.2f}%")
    # Per-class recall, then its unweighted mean.
    recall_per_class = {}
    for class_name in class_actual_count:
        recall_per_class[class_name] = class_correct_count.get(class_name, 0) / class_actual_count[class_name]
    average_recall = (sum(recall_per_class.values()) / len(recall_per_class)) if recall_per_class else 0.0
    print(f"\nAverage Recall: {average_recall * 100:.2f}%")

197
model_test_caltech_cpu1.py Normal file
View File

@@ -0,0 +1,197 @@
import requests
import json
import torch
from PIL import Image
from io import BytesIO
from transformers import AutoImageProcessor, AutoModelForImageClassification
from tqdm import tqdm
import os
import random
import time
# 强制使用CPU
device = torch.device("cpu")
print(f"当前使用的设备: {device}")
class COCOImageClassifier:
def __init__(self, model_path: str, local_image_paths: list):
"""初始化COCO图像分类器"""
self.processor = AutoImageProcessor.from_pretrained(model_path)
self.model = AutoModelForImageClassification.from_pretrained(model_path)
# 将模型移动到CPU
self.model = self.model.to(device)
self.id2label = self.model.config.id2label
self.local_image_paths = local_image_paths
def predict_image_path(self, image_path: str, top_k: int = 5) -> dict:
"""
预测本地图片文件对应的图片类别
Args:
image_path: 本地图片文件路径
top_k: 返回置信度最高的前k个类别
Returns:
包含预测结果的字典
"""
try:
# 打开图片
image = Image.open(image_path).convert("RGB")
# 预处理
inputs = self.processor(images=image, return_tensors="pt")
# 将输入数据移动到CPU
inputs = inputs.to(device)
# 模型推理
with torch.no_grad():
outputs = self.model(**inputs)
# 获取预测结果
logits = outputs.logits
probs = torch.nn.functional.softmax(logits, dim=1)
top_probs, top_indices = probs.topk(top_k, dim=1)
# 整理结果
predictions = []
for i in range(top_k):
class_idx = top_indices[0, i].item()
confidence = top_probs[0, i].item()
predictions.append({
"class_id": class_idx,
"class_name": self.id2label[class_idx],
"confidence": confidence
})
return {
"image_path": image_path,
"predictions": predictions
}
except Exception as e:
print(f"处理图片文件 {image_path} 时出错: {e}")
return None
def batch_predict(self, limit: int = 20, top_k: int = 5) -> list:
"""
批量预测本地图片
Args:
limit: 限制处理的图片数量
top_k: 返回置信度最高的前k个类别
Returns:
包含所有预测结果的列表
"""
results = []
local_image_paths = self.local_image_paths[:limit]
print(f"开始预测 {len(local_image_paths)} 张本地图片...")
start_time = time.time()
for image_path in tqdm(local_image_paths):
result = self.predict_image_path(image_path, top_k)
if result:
results.append(result)
end_time = time.time()
# 计算吞吐量
throughput = len(results) / (end_time - start_time)
print(f"模型每秒可以处理 {throughput:.2f} 张图片")
return results
def save_results(self, results: list, output_file: str = "celtech_cpu_predictions.json"):
"""
保存预测结果到JSON文件
Args:
results: 预测结果列表
output_file: 输出文件名
"""
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
print(f"结果已保存到 {output_file}")
# 主程序
if __name__ == "__main__":
# 替换为本地模型路径
LOCAL_MODEL_PATH = "/home/zhoushasha/models/microsoft_beit_base_patch16_224_pt22k_ft22k"
# 替换为Caltech 256数据集文件夹路径 New
CALTECH_256_PATH = "/home/zhoushasha/models/256ObjectCategoriesNew"
local_image_paths = []
true_labels = {}
# 遍历Caltech 256数据集中的每个文件夹
for folder in os.listdir(CALTECH_256_PATH):
folder_path = os.path.join(CALTECH_256_PATH, folder)
if os.path.isdir(folder_path):
# 获取文件夹名称中的类别名称
class_name = folder.split('.', 1)[1]
# 获取文件夹中的所有图片文件
image_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
# 随机选择3张图片
selected_images = random.sample(image_files, min(3, len(image_files)))
for image_path in selected_images:
local_image_paths.append(image_path)
true_labels[image_path] = class_name
# 创建分类器实例
classifier = COCOImageClassifier(LOCAL_MODEL_PATH, local_image_paths)
# 批量预测
results = classifier.batch_predict(limit=len(local_image_paths), top_k=3)
# 保存结果
classifier.save_results(results)
# 打印简要统计
print(f"\n处理完成: 成功预测 {len(results)} 张图片")
if results:
print("\n示例预测结果:")
sample = results[0]
print(f"图片路径: {sample['image_path']}")
for i, pred in enumerate(sample['predictions'], 1):
print(f"{i}. {pred['class_name']} (置信度: {pred['confidence']:.2%})")
correct_count = 0
total_count = len(results)
class_true_positives = {}
class_false_negatives = {}
for prediction in results:
image_path = prediction['image_path']
top1_prediction = max(prediction['predictions'], key=lambda x: x['confidence'])
predicted_class = top1_prediction['class_name'].lower()
true_class = true_labels.get(image_path).lower()
if true_class not in class_true_positives:
class_true_positives[true_class] = 0
class_false_negatives[true_class] = 0
# 检查预测类别中的每个单词是否包含真实标签
words = predicted_class.split()
for word in words:
if true_class in word:
correct_count += 1
class_true_positives[true_class] += 1
break
else:
class_false_negatives[true_class] += 1
accuracy = correct_count / total_count
print(f"\nAccuracy: {accuracy * 100:.2f}%")
# 计算召回率
total_true_positives = 0
total_false_negatives = 0
for class_name in class_true_positives:
total_true_positives += class_true_positives[class_name]
total_false_negatives += class_false_negatives[class_name]
recall = total_true_positives / (total_true_positives + total_false_negatives)
print(f"Recall: {recall * 100:.2f}%")

166
model_test_caltech_http.py Normal file
View File

@@ -0,0 +1,166 @@
import torch
import time
import os
import multiprocessing
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification
from flask import Flask, request, jsonify
from io import BytesIO
# Cap every common math backend at 4 CPU threads so CPU-side timings are
# comparable across hosts (must be set before the libraries create their pools).
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
os.environ["NUMEXPR_NUM_THREADS"] = "4"
os.environ["OPENBLAS_NUM_THREADS"] = "4"
os.environ["VECLIB_MAXIMUM_THREADS"] = "4"
torch.set_num_threads(4)  # PyTorch intra-op thread pool
# Device handles: prefer CUDA when available, and always keep a CPU handle
# so both devices can be benchmarked side by side.
device_cuda = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")
print(f"当前CUDA设备: {device_cuda}, CPU设备: {device_cpu}")
print(f"CPU核心数设置: {torch.get_num_threads()}")
class ImageClassifier:
    """Runs the same checkpoint on CUDA (when present) and on a 4-thread CPU,
    so per-request latency can be compared between the two devices."""

    def __init__(self, model_path: str):
        self.processor = AutoImageProcessor.from_pretrained(model_path)
        # One model instance per device; the CUDA copy is skipped entirely
        # when no GPU is visible.
        if device_cuda.type == "cuda":
            self.model_cuda = AutoModelForImageClassification.from_pretrained(model_path).to(device_cuda)
        else:
            self.model_cuda = None
        self.model_cpu = AutoModelForImageClassification.from_pretrained(model_path).to(device_cpu)
        # Label mapping taken from the (always present) CPU model.
        self.id2label = self.model_cpu.config.id2label

    def _predict_with_model(self, image, model, device) -> dict:
        """Run one forward pass on ``device`` and time it with perf_counter."""
        try:
            tic = time.perf_counter()  # high-resolution wall clock
            batch = self.processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                out = model(**batch)
            scores = torch.nn.functional.softmax(out.logits, dim=1)
            best_score, best_idx = scores.max(dim=1)
            label_id = best_idx.item()
            elapsed = round(time.perf_counter() - tic, 6)
            return {
                "class_id": label_id,
                "class_name": self.id2label[label_id],
                "confidence": float(best_score.item()),
                "device_used": str(device),
                "processing_time": elapsed
            }
        except Exception as e:
            return {
                "class_id": -1,
                "class_name": "error",
                "confidence": 0.0,
                "device_used": str(device),
                "processing_time": 0.0,
                "error": str(e)
            }

    def predict_single_image(self, image) -> dict:
        """Predict one image on both devices; always reports both results."""
        if self.model_cuda is None:
            cuda_result = {
                "class_id": -1,
                "class_name": "error",
                "confidence": 0.0,
                "device_used": str(device_cuda),
                "processing_time": 0.0,
                "error": "CUDA设备不可用未加载CUDA模型"
            }
        else:
            cuda_result = self._predict_with_model(image, self.model_cuda, device_cuda)
        return {
            "status": "success",
            "cuda_prediction": cuda_result,
            "cpu_prediction": self._predict_with_model(image, self.model_cpu, device_cpu)
        }
# Service bootstrap: one Flask app and one shared classifier instance,
# loaded once at import time.
app = Flask(__name__)
MODEL_PATH = os.environ.get("MODEL_PATH", "/model")  # model path (env var or default)
classifier = ImageClassifier(MODEL_PATH)
@app.route('/v1/private/s782b4996', methods=['POST'])
def predict_single():
    """Accept one uploaded image and return per-device predictions with timings.

    Responds 400 when no ``image`` file is attached, 500 on processing errors;
    both error responses carry the same two-device payload shape as success.
    """
    def _error_payload(message: str):
        # Fix: the same error dict was hand-copied four times in the original;
        # build the shared shape once per device slot.
        def slot(device):
            return {
                "class_id": -1,
                "class_name": "error",
                "confidence": 0.0,
                "device_used": str(device),
                "processing_time": 0.0,
                "error": message
            }
        return {
            "status": "error",
            "cuda_prediction": slot(device_cuda),
            "cpu_prediction": slot(device_cpu)
        }
    if 'image' not in request.files:
        return jsonify(_error_payload("请求中未包含图片")), 400
    image_file = request.files['image']
    try:
        image = Image.open(BytesIO(image_file.read())).convert("RGB")
        result = classifier.predict_single_image(image)
        return jsonify(result)
    except Exception as e:
        return jsonify(_error_payload(str(e))), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: reports device availability and the CPU thread budget."""
    payload = {
        "status": "healthy",
        "cuda_available": device_cuda.type == "cuda",
        "cuda_device": str(device_cuda),
        "cpu_device": str(device_cpu),
        "cpu_threads": torch.get_num_threads()
    }
    return jsonify(payload), 200
if __name__ == "__main__":
    # Bind on all interfaces; port 80 matches the container's exposed port.
    app.run(host='0.0.0.0', port=80, debug=False)

View File

@@ -0,0 +1,163 @@
import requests
import json
import torch
from PIL import Image
from io import BytesIO
from transformers import AutoImageProcessor, AutoModelForImageClassification
from tqdm import tqdm
import os
import random
import time
from flask import Flask, request, jsonify # 引入Flask
# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用的设备: {device}")
class COCOImageClassifier:
    """Image classifier + evaluator used by the Flask evaluation service."""

    def __init__(self, model_path: str):
        """Load processor/model once; image paths are supplied per request."""
        self.processor = AutoImageProcessor.from_pretrained(model_path)
        self.model = AutoModelForImageClassification.from_pretrained(model_path)
        self.model = self.model.to(device)
        if torch.cuda.device_count() > 1:
            print(f"使用 {torch.cuda.device_count()} 块GPU")
            self.model = torch.nn.DataParallel(self.model)
        # DataParallel wraps the model; reach through .module when present.
        self.id2label = self.model.module.config.id2label if hasattr(self.model, 'module') else self.model.config.id2label

    def predict_image_path(self, image_path: str, top_k: int = 5) -> dict:
        """Classify one image file; returns None when it cannot be processed."""
        try:
            image = Image.open(image_path).convert("RGB")
            inputs = self.processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            top_probs, top_indices = probs.topk(top_k, dim=1)
            predictions = []
            for i in range(top_k):
                class_idx = top_indices[0, i].item()
                predictions.append({
                    "class_id": class_idx,
                    "class_name": self.id2label[class_idx],
                    "confidence": top_probs[0, i].item()
                })
            return {
                "image_path": image_path,
                "predictions": predictions
            }
        except Exception as e:
            print(f"处理图片 {image_path} 出错: {e}")
            return None

    def batch_predict_and_evaluate(self, image_paths: list, true_labels: dict, top_k: int = 3) -> dict:
        """Predict ``image_paths`` and compute accuracy and mean per-class recall."""
        results = []
        start_time = time.time()
        for image_path in tqdm(image_paths):
            result = self.predict_image_path(image_path, top_k)
            if result:
                results.append(result)
        total_time = time.time() - start_time
        images_per_second = len(results) / total_time if total_time > 0 else 0
        correct_count = 0
        total_count = len(results)
        class_actual_count = {}
        class_correct_count = {}
        for prediction in results:
            image_path = prediction['image_path']
            top1_prediction = max(prediction['predictions'], key=lambda x: x['confidence'])
            predicted_class = top1_prediction['class_name'].lower()
            true_class = true_labels.get(image_path, "").lower()
            class_actual_count[true_class] = class_actual_count.get(true_class, 0) + 1
            # Fix: an empty true label is a substring of every word, so
            # unlabeled images were previously always counted as correct.
            # Require a non-empty label before substring matching.
            if true_class:
                for word in predicted_class.split():
                    if true_class in word:
                        correct_count += 1
                        class_correct_count[true_class] = class_correct_count.get(true_class, 0) + 1
                        break
        accuracy = correct_count / total_count if total_count > 0 else 0
        recall_per_class = {}
        for class_name in class_actual_count:
            recall_per_class[class_name] = class_correct_count.get(class_name, 0) / class_actual_count[class_name]
        average_recall = sum(recall_per_class.values()) / len(recall_per_class) if recall_per_class else 0
        return {
            "status": "success",
            "metrics": {
                "accuracy": round(accuracy * 100, 2),
                "average_recall": round(average_recall * 100, 2),
                "total_images": total_count,
                "correct_predictions": correct_count,
                "speed_images_per_second": round(images_per_second, 2)
            },
            "sample_predictions": results[:3]
        }
# Service bootstrap: Flask app plus container paths injected via environment.
app = Flask(__name__)
MODEL_PATH = os.environ.get("MODEL_PATH", "/model")  # model path inside the container
DATASET_PATH = os.environ.get("DATASET_PATH", "/app/dataset")  # dataset path inside the container
classifier = COCOImageClassifier(MODEL_PATH)
@app.route('/v1/private/s782b4996', methods=['POST'])
def evaluate():
    """Run the evaluation over a random dataset sample and return metrics."""
    try:
        # Fix: get_json() returns None for a missing/non-JSON body, which made
        # data.get(...) raise AttributeError; fall back to {} so defaults apply.
        data = request.get_json(silent=True) or {}
        limit = data.get("limit", 20)  # number of images to evaluate
        # Load the dataset (paths inside the container).
        local_image_paths = []
        true_labels = {}
        for folder in os.listdir(DATASET_PATH):
            folder_path = os.path.join(DATASET_PATH, folder)
            if os.path.isdir(folder_path):
                # Folders are "NNN.class_name"; fix: tolerate names without a
                # dot (original raised IndexError on them).
                parts = folder.split('.', 1)
                class_name = parts[1] if len(parts) > 1 else parts[0]
                image_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
                selected_images = random.sample(image_files, min(3, len(image_files)))
                for image_path in selected_images:
                    local_image_paths.append(image_path)
                    true_labels[image_path] = class_name
        # Cap the number of processed images.
        local_image_paths = local_image_paths[:limit]
        result = classifier.batch_predict_and_evaluate(local_image_paths, true_labels, top_k=3)
        return jsonify(result)
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": str(e)
        }), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe exposing the active torch device."""
    body = {"status": "healthy", "device": str(device)}
    return jsonify(body), 200
if __name__ == "__main__":
    # Development entry point; production deployments should front this with a WSGI server.
    app.run(host='0.0.0.0', port=8000, debug=False)

View File

@@ -0,0 +1,89 @@
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification
import os
from flask import Flask, request, jsonify
from io import BytesIO
# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用的设备: {device}")
class ImageClassifier:
    """Classifier whose checkpoint lives in a subdirectory of ``model_path``."""

    def __init__(self, model_path: str):
        """Locate the model subdirectory under ``model_path`` and load it.

        Raises:
            ValueError: when ``model_path`` contains no subdirectory.
        """
        subdirs = [d for d in os.listdir(model_path) if os.path.isdir(os.path.join(model_path, d))]
        if not subdirs:
            raise ValueError(f"{model_path} 下未找到任何子目录,无法加载模型")
        # Fix: os.listdir order is arbitrary, so the chosen model was
        # nondeterministic across hosts; sort for a stable choice.
        actual_model_path = os.path.join(model_path, sorted(subdirs)[0])
        print(f"加载模型从: {actual_model_path}")
        self.processor = AutoImageProcessor.from_pretrained(actual_model_path)
        self.model = AutoModelForImageClassification.from_pretrained(actual_model_path)
        self.model = self.model.to(device)
        if torch.cuda.device_count() > 1:
            print(f"使用 {torch.cuda.device_count()} 块GPU")
            self.model = torch.nn.DataParallel(self.model)
        # DataParallel wraps the model; reach through .module when present.
        self.id2label = self.model.module.config.id2label if hasattr(self.model, 'module') else self.model.config.id2label

    def predict_single_image(self, image) -> dict:
        """Predict one PIL image and return the single best class."""
        try:
            inputs = self.processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            max_prob, max_idx = probs.max(dim=1)
            class_idx = max_idx.item()
            return {
                "status": "success",
                "top_prediction": {
                    "class_id": class_idx,
                    "class_name": self.id2label[class_idx],
                    "confidence": max_prob.item()
                }
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e)
            }
# Service bootstrap: Flask app + shared classifier loaded once at import time.
app = Flask(__name__)
MODEL_PATH = os.environ.get("MODEL_PATH", "/model")  # model root (env var or default)
classifier = ImageClassifier(MODEL_PATH)
@app.route('/v1/private/s782b4996', methods=['POST'])
def predict_single():
    """Accept one uploaded image and return the top-1 prediction."""
    uploaded = request.files.get('image')
    if uploaded is None:
        return jsonify({"status": "error", "message": "请求中未包含图片"}), 400
    try:
        pil_image = Image.open(BytesIO(uploaded.read())).convert("RGB")
        return jsonify(classifier.predict_single_image(pil_image))
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe exposing the active torch device."""
    payload = {"status": "healthy", "device": str(device)}
    return jsonify(payload), 200
if __name__ == "__main__":
    # Development entry point on port 8000; front with a WSGI server in production.
    app.run(host='0.0.0.0', port=8000, debug=False)

View File

@@ -0,0 +1,80 @@
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification
import os
from flask import Flask, request, jsonify
from io import BytesIO
# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用的设备: {device}")
class ImageClassifier:
    """Wraps a HuggingFace image-classification checkpoint for single-image inference."""

    def __init__(self, model_path: str):
        self.processor = AutoImageProcessor.from_pretrained(model_path)
        self.model = AutoModelForImageClassification.from_pretrained(model_path)
        self.model = self.model.to(device)
        if torch.cuda.device_count() > 1:
            print(f"使用 {torch.cuda.device_count()} 块GPU")
            self.model = torch.nn.DataParallel(self.model)
        # DataParallel hides config behind .module; unwrap when necessary.
        wrapped = getattr(self.model, 'module', self.model)
        self.id2label = wrapped.config.id2label

    def predict_single_image(self, image) -> dict:
        """Return the single highest-confidence class for one PIL image."""
        try:
            batch = self.processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                logits = self.model(**batch).logits
            scores = torch.nn.functional.softmax(logits, dim=1)
            best_score, best_idx = scores.max(dim=1)
            label_id = best_idx.item()
            return {
                "status": "success",
                "top_prediction": {
                    "class_id": label_id,
                    "class_name": self.id2label[label_id],
                    "confidence": best_score.item()
                }
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e)
            }
# Service bootstrap: Flask app and one shared classifier instance.
app = Flask(__name__)
MODEL_PATH = os.environ.get("MODEL_PATH", "/model")  # model path (env var or default)
classifier = ImageClassifier(MODEL_PATH)
@app.route('/v1/private/s782b4996', methods=['POST'])
def predict_single():
    """Accept one uploaded image and return the top-1 prediction."""
    uploaded = request.files.get('image')
    if uploaded is None:
        return jsonify({"status": "error", "message": "请求中未包含图片"}), 400
    try:
        pil_image = Image.open(BytesIO(uploaded.read())).convert("RGB")
        return jsonify(classifier.predict_single_image(pil_image))
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe exposing the active torch device."""
    body = {"status": "healthy", "device": str(device)}
    return jsonify(body), 200
if __name__ == "__main__":
    # Bind on all interfaces; port 80 matches the container's exposed port.
    app.run(host='0.0.0.0', port=80, debug=False)

24
pyproject.toml Normal file
View File

@@ -0,0 +1,24 @@
[tool.black]
line-length = 80
target-version = ['py39']
[tool.flake8]
max-line-length = 88
count=true
per-file-ignores="./annotation/manager.py:F401"
exclude=["./label", "__pycache__", "./migrations", "./logs", "./pids", "./resources"]
ignore=["W503", "E203"]
enable-extensions="G"
application-import-names=["flake8-isort", "flake8-logging-format", "flake8-builtins"]
import-order-style="edited"
extend-ignore = ["E203", "E701"]
[tool.isort]
py_version=39
profile="black"
multi_line_output=9
line_length=80
group_by_package=true
case_sensitive=true
skip_gitignore=true

13
requirements.txt Normal file
View File

@@ -0,0 +1,13 @@
requests
ruamel.yaml
regex
pyyaml
websocket-client==0.44.0
pydantic==2.6.4
pydantic_core==2.16.3
Levenshtein
numpy
websockets
fabric
vmplatform==0.0.4
flask

114
run.py Normal file
View File

@@ -0,0 +1,114 @@
import gc
import json
import os
import sys
import time
import zipfile
import yaml
from schemas.context import ASRContext
from utils.client import Client
from utils.evaluator import BaseEvaluator
from utils.logger import logger
from utils.service import register_sut
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
UNIT_TEST = os.getenv("UNIT_TEST", 0)
def main():
    """Driver for the ASR leaderboard job.

    Reads all I/O locations from the environment (with test-friendly
    defaults), deploys or locates the system-under-test (SUT), then — in the
    currently disabled section below — would run the dataset evaluation.
    """
    logger.info("执行……")
    # All file locations come from the environment with local-test defaults.
    dataset_filepath = os.getenv(
        "DATASET_FILEPATH",
        "./tests/resources/en.zip",
    )
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config")
    result_filepath = os.getenv("RESULT_FILEPATH", "./out/result")
    bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase")
    detail_cases_filepath = os.getenv("DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl")
    resource_name = os.getenv("BENCHMARK_NAME")
    # Submit config & start the system under test.
    if os.getenv("DATASET_FILEPATH", ""):
        from utils.helm import resource_check
        with open(submit_config_filepath, "r") as fp:
            st_config = yaml.safe_load(fp)
        st_config["values"] = resource_check(st_config.get("values", {}))
        if 'docker_images' in st_config:
            # NOTE(review): hard-coded endpoint — presumably a debug/test path; confirm.
            sut_url = "ws://172.26.1.75:9827"
            os.environ['test'] = '1'
        elif 'docker_image' in st_config:
            sut_url = register_sut(st_config, resource_name)
        elif UNIT_TEST:
            sut_url = "ws://172.27.231.36:80"
        else:
            logger.error("config 配置错误,没有 docker_image")
            os._exit(1)
    else:
        # No dataset configured: local test mode with a fixed SUT endpoint.
        os.environ['test'] = '1'
        sut_url = "ws://172.27.231.36:80"
    if UNIT_TEST:
        exit(0)
    # The evaluation pipeline below is deliberately disabled (string literal).
    # NOTE(review): inside it, logger.info("执行完成. Result = {output_result}")
    # is missing the f-prefix — fix when re-enabling.
    """
    # 数据集处理
    local_dataset_path = "./dataset"
    os.makedirs(local_dataset_path, exist_ok=True)
    with zipfile.ZipFile(dataset_filepath) as zf:
        zf.extractall(local_dataset_path)
    config_path = os.path.join(local_dataset_path, "data.yaml")
    with open(config_path, "r") as fp:
        dataset_config = yaml.safe_load(fp)
    # 数据集信息
    dataset_global_config = dataset_config.get("global", {})
    dataset_query = dataset_config.get("query_data", {})
    evaluator = BaseEvaluator()
    # 开始预测
    for idx, query_item in enumerate(dataset_query):
        gc.collect()
        logger.info(f"开始执行 {idx} 条数据")
        context = ASRContext(**dataset_global_config)
        context.lang = query_item.get("lang", context.lang)
        context.file_path = os.path.join(local_dataset_path, query_item["file"])
        # context.audio_length = query_item["audio_length"]
        interactions = Client(sut_url, context).action()
        context.append_labels(query_item["voice"])
        context.append_preds(
            interactions["predict_data"],
            interactions["send_time"],
            interactions["recv_time"],
        )
        context.fail = interactions["fail"]
        if IN_TEST:
            with open('output.txt', 'w') as fp:
                original_stdout = sys.stdout
                sys.stdout = fp
                print(context)
                sys.stdout = original_stdout
        evaluator.evaluate(context)
        detail_case = evaluator.gen_detail_case()
        with open(detail_cases_filepath, "a") as fp:
            fp.write(json.dumps(detail_case.to_dict(), ensure_ascii=False) + "\n")
        time.sleep(4)
    evaluator.post_evaluate()
    output_result = evaluator.gen_result()
    # print(evaluator.__dict__)
    logger.info("执行完成. Result = {output_result}")
    with open(result_filepath, "w") as fp:
        json.dump(output_result, fp, indent=2, ensure_ascii=False)
    with open(bad_cases_filepath, "w") as fp:
        fp.write("当前榜单不存在 Bad Case\n")
    """
if __name__ == "__main__":
    # Script entry point.
    main()

757
run_async_a10.py Normal file
View File

@@ -0,0 +1,757 @@
import atexit
import concurrent.futures
import fcntl
import gc
import glob
import json
import os
import random
import signal
import sys
import tempfile
import threading
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor
import yaml
from fabric import Connection
from vmplatform import VMOS, Client, VMDataDisk
from schemas.context import ASRContext
from utils.client_async import ClientAsync
from utils.evaluator import BaseEvaluator
from utils.logger import logger
from utils.service import register_sut
# IN_TEST: true when no submit config is provided (local test mode).
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
UNIT_TEST = os.getenv("UNIT_TEST", 0)
DATASET_NUM = os.getenv("DATASET_NUM")
# VM leaderboard parameters (SUT = system under test).
SUT_TYPE = os.getenv("SUT_TYPE", "kubernetes")
SHARE_SUT = os.getenv("SHARE_SUT", "true") == "true"
VM_ID = 0  # set once a VM is created; 0 means "no VM owned by this process"
VM_IP = ""
do_deploy_chart = True  # False when another job already deployed the shared SUT
VM_CPU = int(os.getenv("VM_CPU", "2"))
VM_MEM = int(os.getenv("VM_MEM", "4096"))
MODEL_BASEPATH = os.getenv("MODEL_BASEPATH", "/tmp/customer/leaderboard/pc_asr")
MODEL_MAPPING = json.loads(os.getenv("MODEL_MAPPING", "{}"))
# SSH key material generated at image build time (see Dockerfile).
SSH_KEY_DIR = os.getenv("SSH_KEY_DIR", "/workspace")
SSH_PUBLIC_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa.pub")
SSH_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa")
CONNECT_KWARGS = {"key_filename": SSH_KEY_FILE}
# Shared-SUT coordination files (cross-process lock/status files under /tmp).
JOB_ID = os.getenv("JOB_ID")
dirname = "/tmp/submit_private/sut_share"
os.makedirs(dirname, exist_ok=True)
SUT_SHARE_LOCK = os.path.join(dirname, "lock.lock")
SUT_SHARE_USE_LOCK = os.path.join(dirname, "use.lock")
SUT_SHARE_STATUS = os.path.join(dirname, "status.json")
SUT_SHARE_JOB_STATUS = os.path.join(dirname, f"job_status.{JOB_ID}")
SUT_SHARE_PUBLIC_FAIL = os.path.join(dirname, "one_job_failed")
# Kept open for the lifetime of the process so file locks stay held.
fd_lock = open(SUT_SHARE_USE_LOCK, "a")
def clean_vm_atexit():
    """atexit hook: delete the VM this process created.

    Does nothing when no VM is owned (VM_ID falsy) or when this process did
    not deploy the chart itself (shared SUT owned by another job).
    """
    global VM_ID, do_deploy_chart
    if not VM_ID or not do_deploy_chart:
        return
    logger.info("删除vm")
    err_msg = Client().delete_vm(VM_ID)
    if err_msg:
        logger.warning(f"删除vm失败: {err_msg}")
def put_file_to_vm(c: Connection, local_path: str, remote_path: str):
    """Upload ``local_path`` to ``remote_path`` on the VM over the fabric connection."""
    logger.info(f"uploading file {local_path} to {remote_path}")
    transfer = c.put(local_path, remote_path)
    logger.info("uploaded {0.local} to {0.remote}".format(transfer))
def deploy_windows_sut():
    """Create a Windows-10 VM, push the submitted model/config onto it, and
    launch the ASR server in a daemon thread; returns the SUT websocket URL.

    NOTE(review): this function was reconstructed from whitespace-mangled
    source; the placement of the upload block relative to ``sut_startup``
    mirrors the macOS variant — confirm against the original file.
    """
    global VM_ID
    global VM_IP
    # Parse the submitted config and validate required fields.
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "")
    with open(submit_config_filepath, "r") as fp:
        st_config = yaml.safe_load(fp)
    assert "model" in st_config, "未配置model"
    assert "model_key" in st_config, "未配置model_key"
    assert "config.json" in st_config, "未配置config.json"
    nfs = st_config.get("leaderboard_options", {}).get("nfs", [])
    assert len(nfs) > 0, "未配置nfs"
    assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内"
    model = st_config["model"]
    model_key = st_config["model_key"]
    model_path = ""
    config = st_config["config.json"]
    # Resolve the model archive path from the matching nfs mount entry.
    exist = False
    for nfs_item in nfs:
        if nfs_item["name"] == model_key:
            exist = True
            if nfs_item["source"] == "ceph_customer":
                model_path = os.path.join(
                    "/tmp/customer",
                    nfs_item["srcRelativePath"],
                )
            else:
                model_path = os.path.join(
                    "/tmp/juicefs",
                    nfs_item["srcRelativePath"],
                )
            break
    if not exist:
        raise RuntimeError(f"未找到nfs配置项 name={model_key}")
    # Write the per-submit config.json with the model path as seen from the VM.
    config_path = os.path.join(tempfile.mkdtemp(), "config.json")
    model_dir = os.path.basename(model_path).split(".")[0]
    config["model_path"] = f"E:\\model\\{model_dir}"
    with open(config_path, "w") as fp:
        json.dump(config, fp, ensure_ascii=False, indent=4)
    # Create the VM and register cleanup handlers.
    vmclient = Client()
    with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
        sshpublickey = fp.read().rstrip()
    VM_ID = vmclient.create_vm(
        "amd64",
        VMOS.windows10,
        VM_CPU,
        VM_MEM,
        "leaderboard-%s-submit-%s-job-%s"
        % (
            os.getenv("BENCHMARK_NAME"),
            os.getenv("SUBMIT_ID"),
            os.getenv("JOB_ID"),
        ),
        sshpublickey,
        datadisks=[
            VMDataDisk(
                size=50,
                disk_type="ssd",
                mount_path="/",
                filesystem="NTFS",
            )
        ],
    )
    atexit.register(clean_vm_atexit)
    signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum))
    VM_IP = vmclient.wait_until_vm_running(VM_ID)
    logger.info("vm created successfully, vm_ip: %s", VM_IP)
    def sut_startup():
        # Launch the ASR server on the VM (runs in the daemon thread below).
        with Connection(
            VM_IP,
            "administrator",
            connect_kwargs=CONNECT_KWARGS,
        ) as c:
            script_path = "E:\\base\\asr\\faster-whisper\\server"
            # NOTE(review): the assignment above is immediately overwritten —
            # dead code, presumably a leftover from the faster-whisper setup.
            script_path = "E:\\install\\asr\\sensevoice\\server"
            bat_filepath = f"{script_path}\\start.bat"
            config_filepath = "E:\\submit\\config.json"
            # NOTE(review): empty command — looks like a connectivity probe; confirm.
            result = c.run("")
            assert result.ok
            c.run(
                f'cd /d {script_path} & set "EDGE_ML_ENV_HOME=E:\\install" & {bat_filepath} {config_filepath}',
                warn=True,
            )
    # Upload base package, model archive and config to the VM.
    with Connection(
        VM_IP,
        "administrator",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model])
        filename = os.path.basename(model_filepath)
        put_file_to_vm(c, model_filepath, "/E:/")
        result = c.run("mkdir E:\\base")
        assert result.ok
        result = c.run("mkdir E:\\model")
        assert result.ok
        result = c.run("mkdir E:\\submit")
        assert result.ok
        # NOTE(review): "(unknown)" looks like a corrupted token — almost
        # certainly the uploaded archive name (e.g. {filename}); restore from
        # the original source before use.
        result = c.run(
            f"tar zxvf E:\\(unknown) -C E:\\base --strip-components 1"
        )
        assert result.ok
        result = c.run("E:\\base\\setup-win.bat E:\\install")
        assert result.ok
        put_file_to_vm(c, config_path, "/E:/submit")
        put_file_to_vm(c, model_path, "/E:/model")
        result = c.run(
            f"tar zxvf E:\\model\\{os.path.basename(model_path)} -C E:\\model"
        )
        assert result.ok
    threading.Thread(target=sut_startup, daemon=True).start()
    time.sleep(60)  # crude wait for the server to come up
    return f"ws://{VM_IP}:{config['port']}"
def deploy_macos_sut():
    """Create a macOS-12 VM, push the submitted model/config onto it, and
    launch the ASR server in a daemon thread; returns the SUT websocket URL.

    NOTE(review): reconstructed from whitespace-mangled source; block nesting
    (first Connection block, config write, sut_startup, upload block) should
    be confirmed against the original file.
    """
    global VM_ID
    global VM_IP
    # Parse the submitted config and validate required fields.
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "")
    with open(submit_config_filepath, "r") as fp:
        st_config = yaml.safe_load(fp)
    assert "model" in st_config, "未配置model"
    assert "model_key" in st_config, "未配置model_key"
    assert "config.json" in st_config, "未配置config.json"
    nfs = st_config.get("leaderboard_options", {}).get("nfs", [])
    assert len(nfs) > 0, "未配置nfs"
    assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内"
    model = st_config["model"]
    model_key = st_config["model_key"]
    model_path = ""
    config = st_config["config.json"]
    # Resolve the model archive path from the matching nfs mount entry.
    exist = False
    for nfs_item in nfs:
        if nfs_item["name"] == model_key:
            exist = True
            if nfs_item["source"] == "ceph_customer":
                model_path = os.path.join(
                    "/tmp/customer",
                    nfs_item["srcRelativePath"],
                )
            else:
                model_path = os.path.join(
                    "/tmp/juicefs",
                    nfs_item["srcRelativePath"],
                )
            break
    if not exist:
        raise RuntimeError(f"未找到nfs配置项 name={model_key}")
    config_path = os.path.join(tempfile.mkdtemp(), "config.json")
    model_dir = os.path.basename(model_path).split(".")[0]
    # Create the VM and register cleanup handlers.
    vmclient = Client()
    with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
        sshpublickey = fp.read().rstrip()
    VM_ID = vmclient.create_vm(
        "amd64",
        VMOS.macos12,
        VM_CPU,
        VM_MEM,
        "leaderboard-%s-submit-%s-job-%s"
        % (
            os.getenv("BENCHMARK_NAME"),
            os.getenv("SUBMIT_ID"),
            os.getenv("JOB_ID"),
        ),
        sshpublickey,
        datadisks=[
            VMDataDisk(
                size=50,
                disk_type="ssd",
                mount_path="/",
                filesystem="apfs",
            )
        ],
    )
    atexit.register(clean_vm_atexit)
    signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum))
    VM_IP = vmclient.wait_until_vm_running(VM_ID)
    logger.info("vm created successfully, vm_ip: %s", VM_IP)
    # Discover the data volume mount point on the VM.
    with Connection(
        VM_IP,
        "admin",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        result = c.run("ls -d /Volumes/data*")
        assert result.ok
        volume_path = result.stdout.strip()
    # Write the per-submit config.json with the model path as seen from the VM.
    config["model_path"] = f"{volume_path}/model/{model_dir}"
    with open(config_path, "w") as fp:
        json.dump(config, fp, ensure_ascii=False, indent=4)
    def sut_startup():
        # Launch the ASR server on the VM (runs in the daemon thread below).
        with Connection(
            VM_IP,
            "admin",
            connect_kwargs=CONNECT_KWARGS,
        ) as c:
            script_path = f"{volume_path}/install/asr/sensevoice/server"
            startsh = f"{script_path}/start.sh"
            config_filepath = f"{volume_path}/submit/config.json"
            c.run(
                f"cd {script_path} && sh {startsh} {config_filepath}",
                warn=True,
            )
    # Upload base package, model archive and config to the VM.
    with Connection(
        VM_IP,
        "admin",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model])
        filename = os.path.basename(model_filepath)
        put_file_to_vm(c, model_filepath, f"{volume_path}")
        result = c.run(f"mkdir {volume_path}/base")
        assert result.ok
        result = c.run(f"mkdir {volume_path}/model")
        assert result.ok
        result = c.run(f"mkdir {volume_path}/submit")
        assert result.ok
        # NOTE(review): "(unknown)" looks like a corrupted token — almost
        # certainly the uploaded archive name (e.g. {filename}); restore from
        # the original source before use.
        result = c.run(
            f"tar zxvf {volume_path}/(unknown) -C {volume_path}/base --strip-components 1"  # noqa: E501
        )
        assert result.ok
        result = c.run(
            f"sh {volume_path}/base/setup-mac.sh {volume_path}/install x64"
        )
        assert result.ok
        put_file_to_vm(c, config_path, f"{volume_path}/submit")
        put_file_to_vm(c, model_path, f"{volume_path}/model")
        result = c.run(
            f"tar zxvf {volume_path}/model/{os.path.basename(model_path)} -C {volume_path}/model"  # noqa: E501
        )
        assert result.ok
    threading.Thread(target=sut_startup, daemon=True).start()
    time.sleep(60)  # crude wait for the server to come up
    return f"ws://{VM_IP}:{config['port']}"
def get_sut_url_vm(vm_type: str):
    """Obtain the SUT websocket URL for a VM-based deployment.

    When SHARE_SUT is enabled, multiple jobs coordinate through lock/status
    files so that exactly one job deploys the VM while the others wait and
    reuse its connection info (keys, IP, URL). The deployer also serializes
    access to the SUT via an exclusive flock on fd_lock.

    :param vm_type: "windows" or anything else (treated as macos).
    :return: the SUT websocket URL ("" only if the waiting loop times out
        without ever seeing a "running" status — NOTE(review): confirm this
        fallthrough is intended).
    """
    global VM_ID
    global VM_IP
    global do_deploy_chart
    do_deploy_chart = True

    # Bring up the SUT; abort this job if any sibling job has failed.
    def check_job_failed():
        while True:
            time.sleep(30)
            if os.path.exists(SUT_SHARE_PUBLIC_FAIL):
                logger.error("there is a job failed in current submit")
                sys.exit(1)

    sut_url = ""
    threading.Thread(target=check_job_failed, daemon=True).start()
    if SHARE_SUT:
        # Random jitter so concurrent jobs don't race on the lock file.
        time.sleep(10 * random.random())
        try:
            # O_EXCL-style create: first job to create the lock file deploys.
            open(SUT_SHARE_LOCK, "x").close()
        except Exception:
            do_deploy_chart = False
        start_at = time.time()

        def file_last_updated_at(file: str):
            # Treat a missing status file as "just started" to keep waiting.
            return os.stat(file).st_mtime if os.path.exists(file) else start_at

        if not do_deploy_chart:
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("waiting")
            # Wait up to 24h (measured from the status file's mtime) for the
            # deploying job to publish a "running" status.
            while (
                time.time() - file_last_updated_at(SUT_SHARE_STATUS)
                <= 60 * 60 * 24
            ):
                logger.info(
                    "Waiting sut application to be deployed by another job"
                )
                time.sleep(10 + random.random())
                if os.path.exists(SUT_SHARE_STATUS):
                    # The writer may be mid-write; retry the JSON parse.
                    get_status = False
                    for _ in range(10):
                        try:
                            with open(SUT_SHARE_STATUS, "r") as f:
                                status = json.load(f)
                            get_status = True
                            break
                        except Exception:
                            time.sleep(1 + random.random())
                            continue
                    if not get_status:
                        raise RuntimeError(
                            "Failed to get status of sut application"
                        )
                    assert (
                        status.get("status") != "failed"
                    ), "Failed to deploy sut application, \
please check other job logs"
                    if status.get("status") == "running":
                        # Adopt the deployer's VM identity and SSH key pair.
                        VM_ID = status.get("vmid")
                        VM_IP = status.get("vmip")
                        sut_url = status.get("sut_url")
                        with open(SSH_PUBLIC_KEY_FILE, "w") as fp:
                            fp.write(status.get("pubkey"))
                        with open(SSH_KEY_FILE, "w") as fp:
                            fp.write(status.get("prikey"))
                        logger.info("Successfully get deployed sut application")
                        break
    if do_deploy_chart:
        try:
            # Hold the exclusive SUT lock for the duration of the deployment.
            fcntl.flock(fd_lock, fcntl.LOCK_EX)
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("waiting")
            pending = True

            def update_status():
                # Heartbeat: keep publishing "pending" so waiters' 24h
                # mtime-based timeout does not expire during a long deploy.
                while pending:
                    time.sleep(30)
                    if not pending:
                        break
                    with open(SUT_SHARE_STATUS, "w") as f:
                        json.dump({"status": "pending"}, f)

            threading.Thread(target=update_status, daemon=True).start()
            if vm_type == "windows":
                sut_url = deploy_windows_sut()
            else:
                sut_url = deploy_macos_sut()
        except Exception:
            # Signal both the public failure marker and the shared status.
            open(SUT_SHARE_PUBLIC_FAIL, "w").close()
            with open(SUT_SHARE_STATUS, "w") as f:
                json.dump({"status": "failed"}, f)
            raise
        finally:
            pending = False
        # Publish full connection info so waiting jobs can attach.
        with open(SUT_SHARE_STATUS, "w") as f:
            pubkey = ""
            with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
                pubkey = fp.read().rstrip()
            prikey = ""
            with open(SSH_KEY_FILE, "r") as fp:
                prikey = fp.read()
            json.dump(
                {
                    "status": "running",
                    "vmid": VM_ID,
                    "vmip": VM_IP,
                    "pubkey": pubkey,
                    "sut_url": sut_url,
                    "prikey": prikey,
                },
                f,
            )
    else:
        # Non-deployers: poll for the exclusive lock so only one job talks to
        # the shared SUT at a time.
        while True:
            time.sleep(5 + random.random())
            try:
                fcntl.flock(fd_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except Exception:
                logger.info("尝试抢占调用sut失败,继续等待 5s ...")
    with open(SUT_SHARE_JOB_STATUS, "w") as f:
        f.write("running")
    return sut_url
def get_sut_url():
    """Resolve the SUT endpoint for this run.

    VM-backed SUT types ("windows"/"macos") are delegated to get_sut_url_vm;
    otherwise the submit config is loaded, resource limits are injected
    (CPU-only or vgpu depending on RESOURCE_TYPE) and the SUT chart is
    registered. Without DATASET_FILEPATH the run is a local test and a
    hard-coded endpoint is returned.
    """
    if SUT_TYPE in ("windows", "macos"):
        return get_sut_url_vm(SUT_TYPE)
    submit_config_filepath = os.getenv(
        "SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config"
    )
    CPU = os.getenv("SUT_CPU", "2")
    MEMORY = os.getenv("SUT_MEMORY", "4Gi")
    resource_name = os.getenv("BENCHMARK_NAME")
    # Task info
    # Slavic languages: Russian, Polish
    # Germanic languages: English, German, Dutch
    # Romance (Latin) languages: Spanish, Portuguese, French, Italian
    # Semitic languages: Arabic, Hebrew
    # Submit config & start the system under test
    if os.getenv("DATASET_FILEPATH", ""):
        with open(submit_config_filepath, "r") as fp:
            st_config = yaml.safe_load(fp)
        if "values" not in st_config:
            st_config["values"] = {}
        # Overwrite any submitted resources with the judge-enforced quota.
        st_config["values"]["resources"] = {}
        st_config["values"]["resources"]["limits"] = {}
        st_config["values"]["resources"]["limits"]["cpu"] = CPU
        st_config["values"]["resources"]["limits"]["memory"] = MEMORY
        # st_config["values"]['resources']['limits']['nvidia.com/gpu'] = '1'
        # st_config["values"]['resources']['limits']['nvidia.com/gpumem'] = "1843"
        # st_config["values"]['resources']['limits']['nvidia.com/gpucores'] = "8"
        st_config["values"]["resources"]["requests"] = {}
        st_config["values"]["resources"]["requests"]["cpu"] = CPU
        st_config["values"]["resources"]["requests"]["memory"] = MEMORY
        # st_config["values"]['resources']['requests']['nvidia.com/gpu'] = '1'
        # st_config["values"]['resources']['requests']['nvidia.com/gpumem'] = "1843"
        # st_config["values"]['resources']['requests']['nvidia.com/gpucores'] = "8"
        # st_config['values']['nodeSelector'] = {}
        # st_config["values"]["nodeSelector"][
        #     "contest.4pd.io/accelerator"
        # ] = "A10vgpu"
        # st_config['values']['tolerations'] = []
        # toleration_item = {}
        # toleration_item['key'] = 'hosttype'
        # toleration_item['operator'] = 'Equal'
        # toleration_item['value'] = 'vgpu'
        # toleration_item['effect'] = 'NoSchedule'
        # st_config['values']['tolerations'].append(toleration_item)
        if os.getenv("RESOURCE_TYPE", "cpu") == "cpu":
            # CPU-only run: reject any GPU resource request in the submission.
            values = st_config["values"]
            limits = values.get("resources", {}).get("limits", {})
            requests = values.get("resources", {}).get("requests", {})
            if (
                "nvidia.com/gpu" in limits
                or "nvidia.com/gpumem" in limits
                or "nvidia.com/gpucores" in limits
                or "nvidia.com/gpu" in requests
                or "nvidia.com/gpumem" in requests
                or "nvidia.com/gpucores" in requests
            ):
                raise Exception("禁止使用GPU!")
        else:
            # vgpu run: grant SUT_VGPU virtual GPUs with proportional memory
            # (1843 MiB each) and cores (8% each), pinned to A10 vgpu nodes.
            vgpu_num = int(os.getenv("SUT_VGPU", "3"))
            st_config["values"]["resources"]["limits"]["nvidia.com/gpu"] = (
                str(vgpu_num)
            )
            st_config["values"]["resources"]["limits"][
                "nvidia.com/gpumem"
            ] = str(1843 * vgpu_num)
            st_config["values"]["resources"]["limits"][
                "nvidia.com/gpucores"
            ] = str(8 * vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpu"
            ] = str(vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpumem"
            ] = str(1843 * vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpucores"
            ] = str(8 * vgpu_num)
            st_config["values"]["nodeSelector"] = {}
            st_config["values"]["nodeSelector"][
                "contest.4pd.io/accelerator"
            ] = "A10vgpu"
            st_config["values"]["tolerations"] = []
            toleration_item = {}
            toleration_item["key"] = "hosttype"
            toleration_item["operator"] = "Equal"
            toleration_item["value"] = "vgpu"
            toleration_item["effect"] = "NoSchedule"
            st_config["values"]["tolerations"].append(toleration_item)
        if "docker_images" in st_config:
            # Multi-image test mode: fixed endpoint, mark run as a test.
            sut_url = "ws://172.26.1.75:9827"
            os.environ["test"] = "1"
        elif "docker_image" in st_config:
            sut_url = register_sut(st_config, resource_name)
        elif UNIT_TEST:
            sut_url = "ws://172.27.231.36:80"
        else:
            logger.error("config 配置错误,没有 docker_image")
            os._exit(1)
        return sut_url
    else:
        # No dataset configured: local test mode with a fixed endpoint.
        os.environ["test"] = "1"
        # NOTE(review): the next assignment is immediately overwritten —
        # the first URL is dead code; confirm which endpoint is intended.
        sut_url = "ws://172.27.231.36:80"
        sut_url = "ws://172.26.1.75:9827"
        return sut_url
def load_merge_dataset(dataset_filepath: str) -> dict:
    """Extract a merged multi-language ASR dataset zip and build one query list.

    The outer zip contains per-language zips named "asr.<lang>", each holding a
    data.yaml plus audio files. Per language, query items get absolute audio
    paths, are sorted by audio file size (largest first) and truncated to
    roughly 30 minutes of audio. The per-language lists are then merged into
    config["query_data"] (each item tagged with its lang) and shuffled with a
    fixed seed for reproducibility.

    :param dataset_filepath: path to the merged dataset zip.
    :return: {lang: lang_config, ..., "query_data": [merged query items]}
    """
    local_dataset_path = "./dataset"
    os.makedirs(local_dataset_path, exist_ok=True)
    with zipfile.ZipFile(dataset_filepath) as zf:
        zf.extractall(local_dataset_path)
    config = {}
    sub_datasets = os.listdir(local_dataset_path)
    for sub_dataset in sub_datasets:
        if sub_dataset.startswith("asr."):
            # "asr.<lang>" → extract the language's own zip into ./dataset/<lang>
            lang = sub_dataset[4:]
            lang_path = os.path.join(local_dataset_path, lang)
            os.makedirs(lang_path, exist_ok=True)
            with zipfile.ZipFile(
                os.path.join(local_dataset_path, sub_dataset)
            ) as zf:
                zf.extractall(lang_path)
            lang_config_path = os.path.join(lang_path, "data.yaml")
            with open(lang_config_path, "r") as fp:
                lang_config = yaml.safe_load(fp)
            # Map resolved audio path -> file size in bytes (used as a proxy
            # for audio duration).
            audio_lengths = {}
            for query_item in lang_config.get("query_data", []):
                audio_path = os.path.join(
                    lang_path,
                    query_item["file"],
                )
                query_item["file"] = audio_path
                audio_lengths[query_item["file"]] = os.path.getsize(
                    audio_path,
                )
            lang_config["query_data"] = sorted(
                lang_config.get("query_data", []),
                key=lambda x: audio_lengths[x["file"]],
                reverse=True,
            )
            idx = 0
            length = 0.0
            for query_item in lang_config["query_data"]:
                audio_length = audio_lengths[query_item["file"]]
                # bytes / 32000 ≈ seconds for 16 kHz 16-bit mono PCM —
                # TODO confirm this holds for the dataset's audio format.
                length += audio_length / 32000
                idx += 1
                # Cap each language at half an hour of audio
                if length >= 30 * 60:
                    break
            lang_config["query_data"] = lang_config["query_data"][:idx]
            config[lang] = lang_config
    config["query_data"] = []
    for lang, lang_config in config.items():
        if lang == "query_data":
            continue
        for query_item in lang_config["query_data"]:
            config["query_data"].append(
                {
                    **query_item,
                    "lang": lang,
                }
            )
    # Fixed seed keeps the merged order deterministic across runs.
    random.Random(0).shuffle(config["query_data"])
    return config
def postprocess_failed():
    """Create/truncate the shared failure marker so sibling jobs abort."""
    with open(SUT_SHARE_PUBLIC_FAIL, "w"):
        pass
def main():
    """Run the ASR benchmark end to end.

    Loads the dataset (merged multi-language or single-language layout),
    resolves the SUT endpoint, streams every query through a thread pool,
    evaluates each result and writes result / badcase / detail-case files.
    In shared-SUT mode the deploying job additionally waits for all sibling
    jobs to report success before exiting.
    """
    dataset_filepath = os.getenv(
        "DATASET_FILEPATH",
        "/Users/4paradigm/Projects/dataset/asr/de.zip",
        # "./tests/resources/en.zip",
    )
    result_filepath = os.getenv("RESULT_FILEPATH", "./out/result")
    bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase")
    detail_cases_filepath = os.getenv(
        "DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl"
    )
    thread_num = int(os.getenv("THREAD_NUM", "1"))
    # Dataset handling
    config = {}
    if os.getenv("MERGE_DATASET", "1"):
        config = load_merge_dataset(dataset_filepath)
        dataset_query = config["query_data"]
    else:
        local_dataset_path = "./dataset"
        os.makedirs(local_dataset_path, exist_ok=True)
        with zipfile.ZipFile(dataset_filepath) as zf:
            zf.extractall(local_dataset_path)
        config_path = os.path.join(local_dataset_path, "data.yaml")
        with open(config_path, "r") as fp:
            dataset_config = yaml.safe_load(fp)
        # Resolve audio paths, then sort query_data by descending file size
        # (larger/longer clips are dispatched first).
        lang = os.getenv("lang")
        if lang is None:
            lang = dataset_config.get("global", {}).get("lang", "en")
        audio_lengths = []
        for query_item in dataset_config.get("query_data", []):
            query_item["lang"] = lang
            audio_path = os.path.join(local_dataset_path, query_item["file"])
            query_item["file"] = audio_path
            audio_lengths.append(os.path.getsize(audio_path) / 1024 / 1024)
        # NOTE(review): .index(x) makes this sort O(n^2) and assumes unique
        # items — works, but fragile for large/duplicated datasets.
        dataset_config["query_data"] = sorted(
            dataset_config.get("query_data", []),
            key=lambda x: audio_lengths[dataset_config["query_data"].index(x)],
            reverse=True,
        )
        # Dataset info
        # dataset_global_config = dataset_config.get("global", {})
        dataset_query = dataset_config.get("query_data", {})
        config[lang] = dataset_config
    # sut url
    sut_url = get_sut_url()
    try:
        # Start testing
        logger.info("开始执行")
        evaluator = BaseEvaluator()
        future_list = []
        with ThreadPoolExecutor(max_workers=thread_num) as executor:
            for idx, query_item in enumerate(dataset_query):
                # One ASRContext per query, seeded with the language's
                # global settings plus the per-item path and labels.
                context = ASRContext(
                    **config[query_item["lang"]].get("global", {}),
                )
                context.lang = query_item["lang"]
                context.file_path = query_item["file"]
                context.append_labels(query_item["voice"])
                future = executor.submit(
                    ClientAsync(sut_url, context, idx).action
                )
                future_list.append(future)
            for future in concurrent.futures.as_completed(future_list):
                context = future.result()
                evaluator.evaluate(context)
                detail_case = evaluator.gen_detail_case()
                # Append one JSON line per evaluated case.
                with open(detail_cases_filepath, "a") as fp:
                    fp.write(
                        json.dumps(
                            detail_case.to_dict(),
                            ensure_ascii=False,
                        )
                        + "\n",
                    )
                # Free per-case audio/label memory eagerly.
                del context
                gc.collect()
        evaluator.post_evaluate()
        output_result = evaluator.gen_result()
        logger.info("执行完成")
        with open(result_filepath, "w") as fp:
            json.dump(output_result, fp, indent=2, ensure_ascii=False)
        with open(bad_cases_filepath, "w") as fp:
            fp.write("当前榜单不存在 Bad Case\n")
        if SHARE_SUT:
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("success")
            # Release the SUT serialization lock for the next job.
            fcntl.flock(fd_lock, fcntl.LOCK_UN)
            fd_lock.close()
        # The deploying job lingers until every sibling job reports success
        # (DATASET_NUM of them), so the shared VM is not torn down early.
        while SHARE_SUT and do_deploy_chart:
            time.sleep(30)
            success_num = 0
            for job_status_file in glob.glob(dirname + "/job_status.*"):
                with open(job_status_file, "r") as f:
                    job_status = f.read()
                success_num += job_status == "success"
            if success_num == int(DATASET_NUM):
                break
            logger.info("Waiting for all jobs to complete")
    except Exception:
        if SHARE_SUT:
            postprocess_failed()
        raise
    sys.exit(0)
# Script entry point.
if __name__ == "__main__":
    main()

923
run_callback.py Normal file
View File

@@ -0,0 +1,923 @@
import json
import os
import sys
import time
import tempfile
import zipfile
import threading
from collections import defaultdict
from typing import Dict, List
import yaml
from pydantic import ValidationError
from schemas.dataset import QueryData
from utils.client_callback import ClientCallback, EvaluateResult, StopException
from utils.logger import log
from utils.service import register_sut
from utils.update_submit import change_product_available
from utils.file import dump_json, load_yaml, unzip_dir, load_json, write_file, dump_yaml
from utils.leaderboard import change_product_unavailable
# Lock guarding shared mutable state across threads.
lck = threading.Lock()
# Environment variables by leaderboard
DATASET_FILEPATH = os.environ["DATASET_FILEPATH"]
RESULT_FILEPATH = os.environ["RESULT_FILEPATH"]
DETAILED_CASES_FILEPATH = os.environ["DETAILED_CASES_FILEPATH"]
SUBMIT_CONFIG_FILEPATH = os.environ["SUBMIT_CONFIG_FILEPATH"]
BENCHMARK_NAME = os.environ["BENCHMARK_NAME"]
TEST_CONCURRENCY = int(os.getenv('TEST_CONCURRENCY', 1))
THRESHOLD_OMCER = float(os.getenv('THRESHOLD_OMCER', 0.8))
log.info(f"DATASET_FILEPATH: {DATASET_FILEPATH}")
workspace_path = "/tmp/workspace"
# Environment variables by kubernetes
MY_POD_IP = os.environ["MY_POD_IP"]
# constants
RESOURCE_NAME = BENCHMARK_NAME
# Environment variables by judge_flow_config
LANG = os.getenv("lang")
SUT_CPU = os.getenv("SUT_CPU", "2")
SUT_MEMORY = os.getenv("SUT_MEMORY", "4Gi")
SUT_VGPU = os.getenv("SUT_VGPU", "1")
#SUT_VGPU_MEM = os.getenv("SUT_VGPU_MEM", str(1843 * int(SUT_VGPU)))
#SUT_VGPU_CORES = os.getenv("SUT_VGPU_CORES", str(8 * int(SUT_VGPU)))
SUT_VGPU_ACCELERATOR = os.getenv("SUT_VGPU_ACCELERATOR", "iluvatar-BI-V100")
RESOURCE_TYPE = os.getenv("RESOURCE_TYPE", "vgpu")
assert RESOURCE_TYPE in [
    "cpu",
    "vgpu",
], "benchmark judge_flow_config error: RESOURCE_TYPE should be cpu or vgpu"
# Extract the benchmark dataset into the workspace at import time.
unzip_dir(DATASET_FILEPATH, workspace_path)
def get_sut_url_kubernetes():
    """Build the SUT deployment values from the submit config and register it.

    Loads SUBMIT_CONFIG_FILEPATH, injects a fixed container spec plus iluvatar
    GPU resource requests/limits and a node selector, then registers the SUT
    chart and returns its HTTP base URL.

    Returns:
        str: the registered SUT URL with the ws:// scheme rewritten to http://.

    Raises:
        AssertionError: when the submit config is not a mapping.
    """
    with open(SUBMIT_CONFIG_FILEPATH, "r") as f:
        submit_config = yaml.safe_load(f)
    assert isinstance(submit_config, dict)
    submit_config.setdefault("values", {})
    # Fixed container spec; "sleep 3600" keeps the pod alive so the model can
    # be exercised over HTTP.
    submit_config["values"]["containers"] = [
        {
            "name": "corex-container",
            "image": "harbor.4pd.io/lab-platform/inf/python:3.9",
            "command": ["sleep"],
            "args": ["3600"],
        }
    ]
    # Judge-enforced resources overwrite anything the submitter provided.
    submit_config["values"]["resources"] = {
        "requests": {},
        "limits": {},
    }
    limits = submit_config["values"]["resources"]["limits"]
    requests = submit_config["values"]["resources"]["requests"]
    # iluvatar GPU allocation; SUT_VGPU comes from judge_flow_config.
    vgpu_resource = {
        "iluvatar.ai/gpu": SUT_VGPU,
    }
    limits.update(vgpu_resource)
    requests.update(vgpu_resource)
    # Pin the pod onto iluvatar BI-V100 accelerator nodes.
    submit_config["values"]["nodeSelector"] = {
        "contest.4pd.io/accelerator": "iluvatar-BI-V100"
    }
    # NOTE(review): the tolerations block in the original source was disabled
    # (wrapped in a string literal) and is intentionally not applied here —
    # confirm whether the hosttype tolerations should be re-enabled.
    log.info(f"submit_config: {submit_config}")
    log.info(f"RESOURCE_NAME: {RESOURCE_NAME}")
    return register_sut(submit_config, RESOURCE_NAME).replace(
        "ws://", "http://"
    )
def get_sut_url():
    """Resolve the SUT endpoint; kubernetes deployment is the only backend."""
    sut_endpoint = get_sut_url_kubernetes()
    return sut_endpoint
#os.environ["SUT_URL"] = SUT_URL
#############################################################################
import requests
import base64
def gen_req_body(apiname, APPId, file_path=None, featureId=None, featureInfo=None, dstFeatureId=None):
    """Build the JSON request body for one voiceprint API call.

    :param apiname: one of createFeature / createGroup / deleteFeature /
        queryFeatureList / searchFea / searchScoreFea / updateFeature /
        deleteGroup
    :param APPId: application id placed in the request header
    :param file_path: path of the audio file (required by the audio APIs)
    :param featureId: feature id used by createFeature
    :param featureInfo: feature info used by createFeature
    :param dstFeatureId: target feature id used by searchScoreFea
    :return: dict ready to be serialized as the request payload
    :raises Exception: when apiname is not a supported API
    """
    # Per-API parameters (beyond "func" and the trailing "<func>Res" spec).
    # NOTE(review): group/feature ids are hard-coded as in the vendor sample.
    api_params = {
        'createFeature': {
            "groupId": "test_voiceprint_e",
            "featureId": featureId,
            "featureInfo": featureInfo,
        },
        'createGroup': {
            "groupId": "test_voiceprint_e",
            "groupName": "vip_user",
            "groupInfo": "store_vip_user_voiceprint",
        },
        'deleteFeature': {
            "groupId": "iFLYTEK_examples_groupId",
            "featureId": "iFLYTEK_examples_featureId",
        },
        'queryFeatureList': {
            "groupId": "user_voiceprint_2",
        },
        'searchFea': {
            "groupId": "test_voiceprint_e",
            "topK": 1,
        },
        'searchScoreFea': {
            "groupId": "test_voiceprint_e",
            "dstFeatureId": dstFeatureId,
        },
        'updateFeature': {
            "groupId": "iFLYTEK_examples_groupId",
            "featureId": "iFLYTEK_examples_featureId",
            "featureInfo": "iFLYTEK_examples_featureInfo_update",
        },
        'deleteGroup': {
            "groupId": "iFLYTEK_examples_groupId",
        },
    }
    # APIs that must ship a base64-encoded audio payload.
    audio_apis = {'createFeature', 'searchFea', 'searchScoreFea', 'updateFeature'}
    if apiname not in api_params:
        raise Exception(
            "输入的apiname不在[createFeature, createGroup, deleteFeature, queryFeatureList, searchFea, searchScoreFea,updateFeature]内,请检查")
    # "func" first, API-specific params next, then the "<func>Res" output spec.
    parameter = {"func": apiname}
    parameter.update(api_params[apiname])
    parameter[apiname + "Res"] = {
        "encoding": "utf8",
        "compress": "raw",
        "format": "json"
    }
    body = {
        "header": {
            "app_id": APPId,
            "status": 3
        },
        "parameter": {
            "s782b4996": parameter
        }
    }
    if apiname in audio_apis:
        with open(file_path, "rb") as f:
            audioBytes = f.read()
        body["payload"] = {
            "resource": {
                "encoding": "lame",
                "sample_rate": 16000,
                "channels": 1,
                "bit_depth": 16,
                "status": 3,
                "audio": str(base64.b64encode(audioBytes), 'UTF-8')
            }
        }
    return body
log.info(f"开始请求获取到SUT服务URL")
# Fetch the SUT service URL (this deploys the SUT chart at import time).
sut_url = get_sut_url()
print(f"获取到的SUT_URL: {sut_url}")  # debug output
log.info(f"获取到SUT服务URL: {sut_url}")
from urllib.parse import urlparse
# Global variable holding the last base64-decoded response text (if any).
text_decoded = None
###################################新增新增################################
def req_url(api_name, APPId, file_path=None, featureId=None, featureInfo=None, dstFeatureId=None):
    """
    Issue the request sequence against the SUT and persist the JSON response.
    :param api_name: voiceprint API name (passed to gen_req_body)
    :param APPId: APPID
    :param file_path: file path used in the body
    :return: None (writes the response JSON to RESULT_FILEPATH)
    """
    global text_decoded
    body = gen_req_body(apiname=api_name, APPId=APPId, file_path=file_path, featureId=featureId, featureInfo=featureInfo, dstFeatureId=dstFeatureId)
    #request_url = 'https://ai-cloud.4paradigm.com:9443/sid/v1/private/s782b4996'
    #request_url = 'https://sut:80/sid/v1/private/s782b4996'
    #headers = {'content-type': "application/json", 'host': 'ai-cloud.4paradigm.com', 'appid': APPId}
    parsed_url = urlparse(sut_url)
    headers = {'content-type': "application/json", 'host': parsed_url.hostname, 'appid': APPId}
    # 1. First check the service health endpoint
    response = requests.get(f"{sut_url}/health")
    print(response.status_code, response.text)
    # Request headers
    # NOTE(review): this clobbers the voiceprint headers built above — confirm intended.
    headers = {"Content-Type": "application/json"}
    # Request body (optionally caps the number of images processed)
    # NOTE(review): this clobbers the gen_req_body() result built above — confirm intended.
    body = {"limit": 20 }  # optional parameter limiting total images processed
    # Send the POST request
    response = requests.post(
        f"{sut_url}/v1/private/s782b4996",
        data=json.dumps(body),
        headers=headers
    )
    # Parse the response
    if response.status_code == 200:
        result = response.json()
        print("预测评估结果:")
        print(f"准确率: {result['metrics']['accuracy']}%")
        print(f"平均召回率: {result['metrics']['average_recall']}%")
        print(f"处理图片总数: {result['metrics']['total_images']}")
    else:
        print(f"请求失败,状态码: {response.status_code}")
        print(f"错误信息: {response.text}")
    # Basic-auth credentials (currently unused below)
    # NOTE(review): credential hard-coded in source — move to a secret store.
    auth = ('llm', 'Rmf4#LcG(iFZrjU;2J')
    #response = requests.post(request_url, data=json.dumps(body), headers=headers, auth=auth)
    #response = requests.post(sut_url + "/predict", data=json.dumps(body), headers=headers, auth=auth)
    #response = requests.post(f"{sut_url}/sid/v1/private/s782b4996", data=json.dumps(body), headers=headers, auth=auth)
    """
    response = requests.post(f"{sut_url}/v1/private/s782b4996", data=json.dumps(body), headers=headers)
    """
    #print("HTTP状态码:", response.status_code)
    #print("原始响应内容:", response.text) # 先打印原始内容
    #print(f"请求URL: {sut_url + '/v1/private/s782b4996'}")
    #print(f"请求headers: {headers}")
    #print(f"请求body: {body}")
    #tempResult = json.loads(response.content.decode('utf-8'))
    #print(tempResult)
    """
    # 对text字段进行Base64解码
    if 'payload' in tempResult and 'updateFeatureRes' in tempResult['payload']:
        text_encoded = tempResult['payload']['updateFeatureRes']['text']
        text_decoded = base64.b64decode(text_encoded).decode('utf-8')
        print(f"Base64解码后的text字段内容: {text_decoded}")
    """
    #text_encoded = tempResult['payload']['updateFeatureRes']['text']
    #text_decoded = base64.b64decode(text_encoded).decode('utf-8')
    #print(f"Base64解码后的text字段内容: {text_decoded}")
    # Grab the response JSON and persist it as the benchmark result.
    result = response.json()
    with open(RESULT_FILEPATH, "w") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"结果已成功写入 {RESULT_FILEPATH}")
# Output paths (fall back to local defaults for offline runs).
submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config")
result_filepath = os.getenv("RESULT_FILEPATH", "./out/result")
bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase")
#detail_cases_filepath = os.getenv("DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl")
from typing import Any, Dict, List
def result2file(
    result: Dict[str, Any],
    detail_cases: List[Dict[str, Any]] = None
):
    """Persist the metrics dict to result_filepath as pretty-printed JSON.

    :param result: final metrics mapping; ignored when None.
    :param detail_cases: currently unused (the detail-case write below is
        disabled — see the commented block).
    """
    assert result_filepath is not None
    assert bad_cases_filepath is not None
    #assert detailed_cases_filepath is not None
    if result is not None:
        with open(result_filepath, "w") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)
    #if LOCAL_TEST:
    #    logger.info(f'result:\n {json.dumps(result, indent=4)}')
    """
    if detail_cases is not None:
        with open(detailed_cases_filepath, "w") as f:
            json.dump(detail_cases, f, indent=4, ensure_ascii=False)
        if LOCAL_TEST:
            logger.info(f'result:\n {json.dumps(detail_cases, indent=4)}')
    """
def test_image_prediction(sut_url, image_path):
    """POST one local image to the SUT prediction endpoint.

    :param sut_url: base URL of the SUT service.
    :param image_path: path of the image file to upload.
    :return: (result_dict, None) on success, (None, error_message) otherwise.
    """
    endpoint = f"{sut_url}/v1/private/s782b4996"
    try:
        with open(image_path, 'rb') as fh:
            resp = requests.post(endpoint, files={'image': fh}, timeout=30)
        payload = resp.json()
    except Exception as exc:
        return None, f"请求错误: {str(exc)}"
    if payload.get('status') != 'success':
        return None, f"服务端错误: {payload.get('message')}"
    return payload, None
import random
import time
#from tqdm import tqdm
import os
import requests
# Script entry: health-check the SUT, then evaluate the image dataset below.
if __name__ == '__main__':
    print(f"\n===== main开始请求接口 ===============================================")
    # 1. First check the service health endpoint
    print(f"\n===== 服务健康检查 ===================================================")
    response = requests.get(f"{sut_url}/health")
    print(response.status_code, response.text)
    """
    # 本地图片路径和真实标签(根据实际情况修改)
    image_path = "/path/to/your/test_image.jpg"
    true_label = "cat" # 图片的真实标签
    """
    """
    # 请求头
    headers = {"Content-Type": "application/json"}
    # 请求体(可指定限制处理的图片数量)
    body = {"limit": 20 } # 可选参数,限制处理的图片总数
    # 发送POST请求
    response = requests.post(
        f"{sut_url}/v1/private/s782b4996",
        data=json.dumps(body),
        headers=headers
    )
    """
    """
    # 读取图片文件
    with open(image_path, 'rb') as f:
        files = {'image': f}
        # 发送POST请求
        response = requests.post(f"{sut_url}/v1/private/s782b4996", files=files)
    # 解析响应结果
    if response.status_code == 200:
        result = response.json()
        print("预测评估结果:")
        print(f"准确率: {result['metrics']['accuracy']}%")
        print(f"平均召回率: {result['metrics']['average_recall']}%")
        print(f"处理图片总数: {result['metrics']['total_images']}")
    else:
        print(f"请求失败,状态码: {response.status_code}")
        print(f"错误信息: {response.text}")
    """
###############################################################################################
dataset_root = "/tmp/workspace/256ObjectCategoriesNew" # 数据集根目录
samples_per_class = 3 # 每个类别抽取的样本数
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif') # 支持的图片格式
# 结果统计变量
total_samples = 0
#correct_predictions = 0
# GPU统计
gpu_true_positives = 0
gpu_false_positives = 0
gpu_false_negatives = 0
gpu_total_processing_time = 0.0
# CPU统计
cpu_true_positives = 0
cpu_false_positives = 0
cpu_false_negatives = 0
cpu_total_processing_time = 0.0
"""
# 遍历所有类别文件夹
for folder_name in tqdm(os.listdir(dataset_root), desc="处理类别"):
folder_path = os.path.join(dataset_root, folder_name)
# 提取类别名(从"序号.name"格式中提取name部分
class_name = folder_name.split('.', 1)[1].strip().lower()
# 获取文件夹中所有图片
image_files = []
for file in os.listdir(folder_path):
if file.lower().endswith(image_extensions):
image_files.append(os.path.join(folder_path, file))
# 随机抽取指定数量的图片(如果不足则取全部)
selected_images = random.sample(
image_files,
min(samples_per_class, len(image_files))
)
# 处理选中的图片
for img_path in selected_images:
total_count += 1
# 发送预测请求
prediction, error = test_image_prediction(sut_url, img_path)
if error:
print(f"处理图片 {img_path} 失败: {error}")
continue
# 解析预测结果
pred_class = prediction.get('class_name', '').lower()
confidence = prediction.get('confidence', 0)
# 判断是否预测正确(真实类别是否在预测类别中)
if class_name in pred_class:
correct_predictions += 1
# 可选:打印详细结果
print(f"图片: {os.path.basename(img_path)} | 真实: {class_name} | 预测: {pred_class} | 置信度: {confidence:.4f} | {'正确' if is_correct else '错误'}")
"""
    # Iterate over every class folder in the dataset root.
    for folder_name in os.listdir(dataset_root):
        folder_path = os.path.join(dataset_root, folder_name)
        # Skip anything that is not a directory.
        if not os.path.isdir(folder_path):
            continue
        # Extract the class name (the "name" part of the "<index>.name" folder).
        try:
            class_name = folder_name.split('.', 1)[1].strip().lower()
        except IndexError:
            print(f"警告:文件夹 {folder_name} 命名格式不正确,跳过该文件夹")
            continue
        # Collect every image file in the folder.
        image_files = []
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path) and file.lower().endswith(image_extensions):
                image_files.append(file_path)
        # Randomly draw up to samples_per_class images (all if fewer).
        selected_images = random.sample(
            image_files,
            min(samples_per_class, len(image_files))
        )
        for img_path in selected_images:
            total_samples += 1
            # Request a prediction for this image.
            prediction, error = test_image_prediction(sut_url, img_path)
            # Log the raw prediction for debugging.
            print(f"test_image_prediction返回的prediction: {prediction}")
            print(f"test_image_prediction返回的error: {error}")
            if error:
                print(f"处理图片 {img_path} 失败: {error}")
                continue
            # Parse the GPU-side prediction.
            gpu_pred = prediction.get('cuda_prediction', {})
            gpu_pred_class = gpu_pred.get('class_name', '').lower()
            gpu_processing_time = gpu_pred.get('processing_time', 0.0)
            # Parse the CPU-side prediction.
            cpu_pred = prediction.get('cpu_prediction', {})
            cpu_pred_class = cpu_pred.get('class_name', '').lower()
            cpu_processing_time = cpu_pred.get('processing_time', 0.0)
            # A prediction counts as correct when the true class name is a
            # substring of the predicted class name.
            gpu_is_correct = class_name in gpu_pred_class
            if gpu_is_correct:
                gpu_true_positives += 1
            else:
                gpu_false_positives += 1
                gpu_false_negatives += 1
            # Same check for the CPU prediction.
            cpu_is_correct = class_name in cpu_pred_class
            if cpu_is_correct:
                cpu_true_positives += 1
            else:
                cpu_false_positives += 1
                cpu_false_negatives += 1
            # Accumulate processing times.
            gpu_total_processing_time += gpu_processing_time
            cpu_total_processing_time += cpu_processing_time
            # Print per-image details.
            print(f"图片: {os.path.basename(img_path)} | 真实: {class_name}")
            print(f"GPU预测: {gpu_pred_class} | {'正确' if gpu_is_correct else '错误'} | 耗时: {gpu_processing_time:.6f}s")
            print(f"CPU预测: {cpu_pred_class} | {'正确' if cpu_is_correct else '错误'} | 耗时: {cpu_processing_time:.6f}s")
            print("-" * 50)
"""
# 计算整体指标(在单标签场景下,准确率=召回率)
if total_samples == 0:
overall_accuracy = 0.0
overall_recall = 0.0
else:
overall_accuracy = correct_predictions / total_samples
overall_recall = correct_predictions / total_samples # 整体召回率
# 输出统计结果
print("\n" + "="*50)
print(f"测试总结:")
print(f"总测试样本数: {total_samples}")
print(f"正确预测样本数: {correct_predictions}")
print(f"整体准确率: {overall_accuracy:.4f} ({correct_predictions}/{total_samples})")
print(f"整体召回率: {overall_recall:.4f} ({correct_predictions}/{total_samples})")
print("="*50)
"""
# 初始化结果字典
result = {
# GPU指标
"gpu_accuracy": 0.0,
"gpu_recall": 0.0,
"gpu_running_time": round(gpu_total_processing_time, 6),
"gpu_throughput": 0.0,
# CPU指标
"cpu_accuracy": 0.0,
"cpu_recall": 0.0,
"cpu_running_time": round(cpu_total_processing_time, 6),
"cpu_throughput": 0.0
}
# 计算GPU指标
gpu_accuracy = gpu_true_positives / total_samples * 100
gpu_recall_denominator = gpu_true_positives + gpu_false_negatives
gpu_recall = gpu_true_positives / gpu_recall_denominator * 100 if gpu_recall_denominator > 0 else 0
gpu_throughput = total_samples / gpu_total_processing_time if gpu_total_processing_time > 1e-6 else 0
# 计算CPU指标
cpu_accuracy = cpu_true_positives / total_samples * 100
cpu_recall_denominator = cpu_true_positives + cpu_false_negatives
cpu_recall = cpu_true_positives / cpu_recall_denominator * 100 if cpu_recall_denominator > 0 else 0
cpu_throughput = total_samples / cpu_total_processing_time if cpu_total_processing_time > 1e-6 else 0
# 更新结果字典
result.update({
"gpu_accuracy": round(gpu_accuracy, 6),
"gpu_recall": round(gpu_recall, 6),
"gpu_throughput": round(gpu_throughput, 6),
"cpu_accuracy": round(cpu_accuracy, 6),
"cpu_recall": round(cpu_recall, 6),
"cpu_throughput": round(cpu_throughput, 6)
})
# 打印最终统计结果
print("\n" + "="*50)
print(f"总样本数: {total_samples}")
print("\nGPU指标:")
print(f"准确率: {result['gpu_accuracy']:.4f}%")
print(f"召回率: {result['gpu_recall']:.4f}%")
print(f"总运行时间: {result['gpu_running_time']:.6f}s")
print(f"吞吐量: {result['gpu_throughput']:.2f}张/秒")
print("\nCPU指标:")
print(f"准确率: {result['cpu_accuracy']:.4f}%")
print(f"召回率: {result['cpu_recall']:.4f}%")
print(f"总运行时间: {result['cpu_running_time']:.6f}s")
print(f"吞吐量: {result['cpu_throughput']:.2f}张/秒")
print("="*50)
#result = {}
#result['accuracy_1_1'] = 3
result2file(result)
if abs(gpu_accuracy - cpu_accuracy) > 3:
log.error(f"gpu与cpu准确率差别超过3%,模型结果不正确")
change_product_unavailable()
"""
if result['accuracy_1_1'] < 0.9:
log.error(f"1:1正确率未达到90%, 视为产品不可用")
change_product_unavailable()
if result['accuracy_1_N'] < 1:
log.error(f"1:N正确率未达到100%, 视为产品不可用")
change_product_unavailable()
if result['1_1_latency'] > 0.5:
log.error(f"1:1平均latency超过0.5s, 视为产品不可用")
change_product_unavailable()
if result['1_N_latency'] > 0.5:
log.error(f"1:N平均latency超过0.5s, 视为产品不可用")
change_product_unavailable()
if result['enroll_latency'] > 1:
log.error(f"enroll(入库)平均latency超过1s, 视为产品不可用")
change_product_unavailable()
"""
exit_code = 0

1193
run_callback_cuda.py Normal file

File diff suppressed because it is too large Load Diff

1296
run_callback_new.py Normal file

File diff suppressed because it is too large Load Diff

0
schemas/__init__.py Normal file
View File

90
schemas/context.py Normal file
View File

@@ -0,0 +1,90 @@
import os
from copy import deepcopy
from typing import Dict, List, Optional
from pydantic import BaseModel, Field
from schemas.stream import StreamDataModel
class LabelContext(BaseModel):
    """One ground-truth transcript span for an audio file.

    Times appear to be in seconds (matching QueryDataSentence) — TODO confirm.
    """

    start: float  # span start time
    end: float  # span end time
    answer: str  # reference transcript text for this span
class PredContext(BaseModel):
    """One recognition callback result plus client-side timing metadata."""

    recognition_results: StreamDataModel  # the parsed streaming result
    recv_time: Optional[float] = Field(None)  # perf-counter time result was received
    send_time: Optional[float] = Field(None)  # perf-counter time matching chunk was sent
class ASRContext:
    """Mutable per-audio test context: stream parameters (overridable via
    kwargs and environment variables), the ground-truth labels and the
    predictions collected for one audio file."""

    def __init__(self, **kwargs):
        # Audio/stream parameters; defaults describe a 16-bit mono 16 kHz WAV.
        self.bits = kwargs.get("bits", 16)
        self.channel = kwargs.get("channel", 1)
        self.sample_rate = kwargs.get("sample_rate", 16000)
        self.audio_format = kwargs.get("format", "wav")
        self.enable_words = kwargs.get("enable_words", True)
        # Minimum predicted/label character ratio checked during streaming.
        self.char_contains_rate = kwargs.get("char_contains_rate", 0.8)
        # The "lang" environment variable wins over the kwarg.
        self.lang = os.getenv("lang")
        if self.lang is None:
            self.lang = kwargs.get("lang", "en")
        self.stream = kwargs.get("stream", True)
        # Seconds between audio chunks; chunk size in bytes is derived from it.
        self.wait_time = float(os.getenv("wait_time", 0.1))
        # NOTE(review): formula ignores self.channel — correct only for mono
        # audio; confirm before using multi-channel files.
        self.chunk_size = self.sample_rate * self.bits / 8 * self.wait_time
        if int(os.getenv('chunk_size_set', 0)):
            # Explicit chunk-size override from the environment.
            self.chunk_size = int(os.getenv('chunk_size_set', 0))
        self.audio_length = 0
        self.file_path = ""
        self.labels: List[LabelContext] = kwargs.get("labels", [])
        self.preds: List[PredContext] = kwargs.get("preds", [])
        self.label_sentences: List[str] = []
        self.pred_sentences: List[str] = []
        self.send_time_start_end = []  # [first, last] send timestamps
        self.recv_time_start_end = []  # [first, last] recv timestamps
        self.fail = False
        self.fail_char_contains_rate_num = 0
        self.punctuation_num = 0
        self.pred_punctuation_num = 0

    def append_labels(self, voices: List[Dict]):
        """Convert raw label dicts into LabelContext objects and store them."""
        for voice_data in voices:
            label_context = LabelContext(**voice_data)
            self.labels.append(label_context)

    def append_preds(
        self,
        predict_data: List[StreamDataModel],
        send_time: List[float],
        recv_time: List[float],
    ):
        """Store predictions paired with their send/recv timestamps.

        Also records the first/last send and recv times. The three lists are
        zipped, so extra elements in any longer list are silently dropped.
        """
        self.send_time_start_end = [send_time[0], send_time[-1]] if len(send_time) > 0 else []
        self.recv_time_start_end = [recv_time[0], recv_time[-1]] if len(recv_time) > 0 else []
        for pred_item, send_time_item, recv_time_item in zip(predict_data, send_time, recv_time):
            # Deep-copy so later mutation of the source models cannot leak in.
            pred_item = deepcopy(pred_item)
            pred_context = PredContext(recognition_results=pred_item.model_dump())
            pred_context.send_time = send_time_item
            pred_context.recv_time = recv_time_item
            self.preds.append(pred_context)

    def to_dict(self):
        """Serialize the context for reporting; labels/preds become JSON strings."""
        return {
            "bits": self.bits,
            "channel": self.channel,
            "sample_rate": self.sample_rate,
            "audio_format": self.audio_format,
            "enable_words": self.enable_words,
            "stream": self.stream,
            "wait_time": self.wait_time,
            "chunk_size": self.chunk_size,
            "labels": [item.model_dump_json() for item in self.labels],
            "preds": [item.model_dump_json() for item in self.preds],
        }

18
schemas/dataset.py Normal file
View File

@@ -0,0 +1,18 @@
from typing import List
from pydantic import BaseModel, Field
class QueryDataSentence(BaseModel):
    """One labelled sentence inside an audio file (times in seconds)."""

    answer: str = Field(description="文本label")
    start: float = Field(description="句子开始时间")
    end: float = Field(description="句子结束时间")
class QueryData(BaseModel):
    """Ground-truth record for one audio file in the test dataset."""

    lang: str = Field(description="语言")
    file: str = Field(description="音频文件位置")
    duration: float = Field(description="音频长度")
    voice: List[QueryDataSentence] = Field(
        description="音频文件的文本label内容"
    )

66
schemas/stream.py Normal file
View File

@@ -0,0 +1,66 @@
from typing import List
from pydantic import BaseModel, ValidationError, field_validator
from pydantic import model_validator
class StreamWordsModel(BaseModel):
    """A single recognized word with its time span (ms as sent by the service)."""

    text: str  # the word text (may be a punctuation token)
    start_time: float
    end_time: float

    @model_validator(mode="after")
    def check_result(self):
        """Reject word spans whose end precedes their start."""
        if self.end_time < self.start_time:
            # BUG FIX: pydantic v2's ValidationError cannot be constructed from
            # a bare message (doing so raised TypeError instead of failing
            # validation). Raising ValueError lets pydantic wrap it properly.
            raise ValueError("end-time 小于 start-time, error")
        return self
class StreamDataModel(BaseModel):
    """One streaming recognition result (times in ms as sent by the service)."""

    text: str  # recognized text for this paragraph
    language: str  # language code reported by the service
    final_result: bool  # True when this paragraph will not be revised again
    para_seq: int  # paragraph sequence number
    start_time: float
    end_time: float
    words: List[StreamWordsModel]

    @model_validator(mode="after")
    def check_result(self):
        """Reject results whose end time precedes their start time."""
        if self.end_time < self.start_time:
            # BUG FIX: pydantic v2's ValidationError cannot be constructed from
            # a bare message (doing so raised TypeError instead of failing
            # validation). Raising ValueError lets pydantic wrap it properly.
            raise ValueError("end-time 小于 start-time, error")
        return self
class StreamResultModel(BaseModel):
    """Envelope for one streaming callback; converts service times (ms) to seconds."""

    # NOTE(review): the field is named `asr_results` here, while utils/client.py
    # reads `.recognition_results` from this model — one of the two looks stale;
    # confirm which schema the deployed service actually uses.
    asr_results: StreamDataModel

    @field_validator('asr_results', mode="after")
    def convert_to_seconds(cls, v: StreamDataModel, values):
        # Convert all millisecond timestamps to seconds in-place.
        v.end_time = v.end_time / 1000
        v.start_time = v.start_time / 1000
        for word in v.words:
            word.start_time /= 1000
            word.end_time /= 1000
        return v

    class Config:
        # Re-run validators when attributes are assigned after construction.
        validate_assignment = True
class NonStreamDataModel(BaseModel):
    """One paragraph of a non-streaming recognition response."""

    text: str  # recognized paragraph text
    para_seq: int  # paragraph sequence number
    start_time: float
    end_time: float

    @model_validator(mode="after")
    def check_result(self):
        """Reject results whose end time precedes their start time."""
        if self.end_time < self.start_time:
            # BUG FIX: pydantic v2's ValidationError cannot be constructed from
            # a bare message (doing so raised TypeError instead of failing
            # validation). Raising ValueError lets pydantic wrap it properly.
            raise ValueError("end-time 小于 start-time, error")
        return self
class NonStreamResultModel(BaseModel):
    """Non-streaming recognition response: the full list of paragraph results."""

    contents: List[NonStreamDataModel]

View File

@@ -0,0 +1,53 @@
import os
import sys
from collections import defaultdict
import yaml
def main(dataset_dir):
    """Sanity-check every dataset sub-folder's data.yaml.

    For each query, verifies that every voice span has start <= end (err1) and
    that consecutive spans do not overlap (err2). Prints offending entries, the
    number of folders scanned, the set of problem folders and a per-folder
    overlap count.
    """
    entries = os.listdir(dataset_dir)
    # Keep only sub-directories (avoid shadowing the `dir` builtin).
    subdirs = [d for d in entries if os.path.isdir(os.path.join(dataset_dir, d))]
    problem_dirs = set()
    problem_count = defaultdict(int)
    for subdir in subdirs:
        with open(os.path.join(dataset_dir, subdir, "data.yaml"), "r") as f:
            data = yaml.full_load(f)
        for query_i, query in enumerate(data["query_data"]):
            voices = sorted(query["voice"], key=lambda x: x["start"])
            if voices != query["voice"]:
                print("-----", subdir)
            if not voices:
                # Guard: an empty voice list previously crashed with IndexError.
                continue
            if voices[0]["start"] > voices[0]["end"]:
                print(
                    "err1: %s%s个query的第%d个voice的start大于end: %s"
                    % (subdir, query_i, 0, voices[0]["answer"])
                )
                problem_dirs.add(subdir)
            for voice_i in range(1, len(voices)):
                voice = voices[voice_i]
                if voice["start"] > voice["end"]:
                    print(
                        "err1: %s%s个query的第%d个voice的start大于end: %s"
                        % (subdir, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(subdir)
                if voice["start"] < voices[voice_i - 1]["end"]:
                    print(
                        "err2: %s%s个query的第%d个voice的start小于前一个voice的end: %s"
                        % (subdir, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(subdir)
                    problem_count[subdir] += 1
    print(len(subdirs))
    print(problem_dirs)
    print(problem_count)
if __name__ == "__main__":
    # CLI entry: expects the dataset folder path as the only argument.
    if len(sys.argv) < 2:
        print("指定 测试数据集文件夹")
        sys.exit(1)
    main(sys.argv[1])

View File

@@ -0,0 +1,108 @@
import json
import os
import shutil
import sys
import zipfile
import yaml
"""
target
{
"global": {
"lang": ""
},
"query_data": [
"file": "",
"duration": 2.0,
"voice": [
{
"answer": "",
"start": 0.0,
"end": 1.0
}
]
]
}
"""
def situation_a(meta, dataset_folder, output_folder):
    """Convert a "combined" meta.json into per-language datasets.

    For each language: copies the referenced audio (as .mp3), writes a
    data.yaml in the target format, then zips the language folder.

    Expected input shape:
    {
        "combined": {
            "en": [
                {
                    "wav": "*.wav",
                    "transcriptions": [
                        {
                            "text": "",
                            "start": 0.0,
                            "end": 1.0
                        }
                    ],
                    "duration": 2.0
                }
            ]
        }
    }
    """
    meta = meta["combined"]
    for lang, arr in meta.items():
        print("processing", lang)
        # Language keys are expected to be 2-letter codes.
        assert len(lang) == 2
        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)
        data = {"global": {"lang": lang}, "query_data": []}
        query_data = data["query_data"]
        for item in arr:
            os.makedirs(
                os.path.join(lang_folder, os.path.dirname(item["wav"])),
                exist_ok=True,
            )
            # Dataset stores mp3 files under the wav name with the extension
            # swapped — assumes the name ends in a 4-char ".wav" suffix.
            mp3_file = item["wav"][:-4] + ".mp3"
            shutil.copyfile(
                os.path.join(dataset_folder, mp3_file),
                os.path.join(lang_folder, mp3_file),
            )
            query_data_item = {
                "file": mp3_file,
                "duration": float(item["duration"]),
                "voice": [],
            }
            query_data.append(query_data_item)
            voice = query_data_item["voice"]
            for v in item["transcriptions"]:
                voice.append(
                    {
                        "answer": v["text"],
                        "start": float(v["start"]),
                        "end": float(v["end"]),
                    }
                )
        with open(os.path.join(lang_folder, "data.yaml"), "w") as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")
        # Pack the language folder; arcnames are made relative to its root.
        with zipfile.ZipFile(
            os.path.join(output_folder, lang + ".zip"), "w"
        ) as ziper:
            dirname = lang_folder
            for path, _, files in os.walk(dirname):
                for file in files:
                    ziper.write(
                        os.path.join(path, file),
                        os.path.join(path[len(dirname) :], file),
                        zipfile.ZIP_DEFLATED,
                    )
if __name__ == "__main__":
    # CLI entry: expects the dataset folder path and the output path.
    if len(sys.argv) < 3:
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]
    with open(os.path.join(dataset_folder, "meta.json")) as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)

View File

@@ -0,0 +1,56 @@
import json
import sys
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops
def main(detailcase_file: str):
    """Evaluate the first detail case in the file, keeping only final results."""
    with open(detailcase_file) as f:
        case = json.load(f)[0]
    # Parse every prediction, then keep only the final (non-revisable) ones.
    parsed_preds = [StreamDataModel(**p) for p in case["preds"]]
    final_preds = [p for p in parsed_preds if p.final_result]
    label = QueryData(**case["label"])
    print(evaluate_editops(label, final_preds))
def evaluate_from_record(detailcase_file: str, record_path: str):
    """Re-run evaluation from a saved record, using only final results."""
    with open(detailcase_file) as f:
        case = json.load(f)[0]
    label = QueryData(**case["label"])
    with open(record_path) as f:
        record = json.load(f)
    recognition_results = [
        StreamDataModel(**item) for item in record["recognition_results"]
    ]
    # Keep each prediction token together with its result, but only for
    # results flagged as final.
    kept_tokens, kept_results = [], []
    for token, result in zip(record["tokens_pred"], recognition_results):
        if result.final_result:
            kept_tokens.append(token)
            kept_results.append(result)
    print(
        evaluate_editops(
            label,
            kept_results,
            kept_tokens,
            record["tokens_label"],
        )
    )
if __name__ == "__main__":
    # CLI entry: expects the detail-case JSON path as the only argument.
    if len(sys.argv) < 2:
        print("请指定 detailcase 文件路径")
        sys.exit(1)
    main(sys.argv[1])
    # evaluate_from_record(sys.argv[1], sys.argv[2])

BIN
ssh-keygen Executable file

Binary file not shown.

11
starting_kit/Dockerfile Normal file
View File

@@ -0,0 +1,11 @@
# syntax=docker/dockerfile:1
FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0
WORKDIR /workspace
# Copy only the dependency manifest first so the pip layer stays cached until
# requirements.txt itself changes. COPY is preferred over ADD for plain files.
COPY requirements.txt /workspace/
# --no-cache-dir keeps the pip download cache out of the image layer
# (replaces the weaker post-hoc `pip cache purge`).
RUN pip install --no-cache-dir -r ./requirements.txt -i https://nexus.4pd.io/repository/pypi-all/simple --trusted-host nexus.4pd.io --extra-index-url https://mirrors.aliyun.com/pypi/simple/
COPY . /workspace
CMD ["python", "main.py"]

313
starting_kit/main.py Normal file
View File

@@ -0,0 +1,313 @@
import logging
import os
import threading
import time
from typing import Optional
import flask
import requests
from werkzeug.datastructures import FileStorage
app = flask.Flask(__name__)
# Set once the heartbeat thread has started, so only one thread ever runs.
heartbeat_active = False
# Module logger writing to stderr; propagation disabled to avoid duplicate
# records from the root logger.
log = logging.getLogger(__name__)
log.propagate = False
level = logging.INFO
log.setLevel(level)
formatter = logging.Formatter(
    "[%(asctime)s] %(levelname)s : %(pathname)s:%(lineno)d - %(message)s",
    "%Y-%m-%d %H:%M:%S",
)
streamHandler = logging.StreamHandler()
streamHandler.setLevel(level)
streamHandler.setFormatter(formatter)
log.addHandler(streamHandler)
def heartbeat(url):
    """POST a RUNNING status to `url` every 10 seconds, forever.

    Only the first caller actually loops; subsequent calls return immediately
    thanks to the module-level `heartbeat_active` flag. Network errors are
    swallowed — the heartbeat is best-effort.
    """
    global heartbeat_active
    if heartbeat_active:
        return
    heartbeat_active = True
    while True:
        try:
            # BUG FIX: without a timeout a wedged endpoint could hang this
            # daemon thread forever and stop all future heartbeats.
            requests.post(url, json={"status": "RUNNING"}, timeout=10)
        except Exception:
            pass
        time.sleep(10)
def asr(
    audio_file: FileStorage,
    language: Optional[str],
    progressCallbackUrl: str,
    taskId: str,
):
    """TODO: read audio_file, call the ASR service, and stream recognition
    results back to progressCallbackUrl in real time.

    The body below is a hard-coded demo payload showing the expected callback
    protocol, not a real implementation.
    """
    # ignore BEGIN
    # Used only for local leaderboard testing.
    if os.getenv("LOCAL_TEST"):
        return local_test(progressCallbackUrl, taskId)
    # ignore END
    language = "de"
    # One incremental recognition callback.
    requests.post(
        progressCallbackUrl,
        json={
            "taskId": taskId,
            "status": "RUNNING",
            # Incremental result; do NOT send this field when status is
            # FINISHED or ERROR.
            "recognition_results": {
                "text": "最先启动的还是",
                "final_result": True,
                "para_seq": 0,
                "language": language,
                "start_time": 6300,
                "end_time": 6421,
                "words": [
                    {
                        "text": "",
                        "start_time": 6300,
                        "end_time": 6321,
                    },
                    {
                        "text": "",
                        "start_time": 6321,
                        "end_time": 6345,
                    },
                    {
                        "text": "",
                        "start_time": 6345,
                        "end_time": 6350,
                    },
                    {
                        "text": "",
                        "start_time": 6350,
                        "end_time": 6370,
                    },
                    {
                        "text": "",
                        "start_time": 6370,
                        "end_time": 6386,
                    },
                    {
                        "text": "",
                        "start_time": 6386,
                        "end_time": 6421,
                    },
                    {
                        "text": "",
                        "start_time": 6421,
                        "end_time": 6435,
                    },
                ],
            },
        },
    )
    # ... all recognition results have been returned.
    # Signal end of recognition.
    requests.post(
        progressCallbackUrl,
        json={
            "taskId": taskId,
            "status": "FINISHED",
        },
    )
@app.post("/predict")
def predict():
    """Accept an audio file plus callback URLs; start the heartbeat and the
    async recognition worker, then return immediately with {"status": "OK"}."""
    body = flask.request.form
    language = body.get("language")
    if language is None:
        "自行判断语种"
        # NOTE(review): the bare string above is a no-op statement — a
        # placeholder meaning "detect the language yourself".
    taskId = body["taskId"]
    progressCallbackUrl = body["progressCallbackUrl"]
    heartbeatUrl = body["heartbeatUrl"]
    # Daemon heartbeat thread (only the first one actually loops).
    threading.Thread(
        target=heartbeat, args=(heartbeatUrl,), daemon=True
    ).start()
    audio_file = flask.request.files["file"]
    # audio_file.stream  # read the file stream
    # audio_file.save("audio.mp3")  # save to disk
    # Run recognition off the request thread so the HTTP call returns fast.
    threading.Thread(
        target=asr,
        args=(audio_file, language, progressCallbackUrl, taskId),
        daemon=True,
    ).start()
    return flask.jsonify({"status": "OK"})
# ignore BEGIN
def local_test(progressCallbackUrl: str, taskId: str):
    """Ignore this method; it is only used for local leaderboard debugging.

    It replays the first query of a local data.yaml as fake streaming ASR
    callbacks, with randomized sentence merging, word dropping and partial
    (final_result=False) sends to simulate a real recognizer.
    """
    import random
    import re

    import yaml

    def callback(content):
        # Best-effort POST; content=None signals the end of recognition.
        try:
            if content is None:
                requests.post(
                    progressCallbackUrl,
                    json={"taskId": taskId, "status": "FINISHED"},
                )
            else:
                requests.post(
                    progressCallbackUrl,
                    json={
                        "taskId": taskId,
                        "status": "RUNNING",
                        "recognition_results": content,
                    },
                )
        except Exception:
            pass

    with open(
        os.getenv("LOCAL_TEST_DATA_PATH", "../dataset/out/data.yaml")
    ) as f:
        data = yaml.full_load(f)
    voices = data["query_data"][0]["voice"]
    # Initial delay before the first send.
    first_send_time = random.randint(3, 5)
    send_interval = random.random() * 0
    log.info("首次发送%ss 发送间隔%ss" % (first_send_time, send_interval))
    time.sleep(first_send_time)
    # Randomly merge consecutive sentences into single sentences.
    if random.random() < 0.3:
        log.info("将部分句子合并成单句 每次合并的句子不超过3句")
        rand_idx = 0
        rand_sep = [0, len(voices) - 1]
        while rand_sep[rand_idx] + 1 <= rand_sep[rand_idx + 1] - 1:
            rand_cursep = random.randint(
                rand_sep[rand_idx] + 1,
                min(rand_sep[rand_idx + 1] - 1, rand_sep[rand_idx] + 1 + 3),
            )
            rand_sep.insert(rand_idx + 1, rand_cursep)
            rand_idx += 1
        merged_voices = []
        for i, cur_sep in enumerate(rand_sep[:-1]):
            voice = voices[cur_sep]
            for j in range(cur_sep + 1, rand_sep[i + 1]):
                voice["answer"] += voices[j]["answer"]
                voice["end"] = voices[j]["end"]
            merged_voices.append(voice)
        merged_voices.append(voices[rand_sep[-1]])
        voices = merged_voices

    def split_and_keep(text, delimiters):
        # Build a regex matching runs of non-delimiters or single delimiters,
        # so the delimiters are kept as their own tokens.
        pattern = "|".join(re.escape(delimiter) for delimiter in delimiters)
        pattern = f"(?:[^{pattern}]+|[{pattern}])"
        return re.findall(pattern, text)

    puncs = [",", ".", "?", "!", ";", ":"]
    para_seq = 0
    for voice in voices:
        answer: str = voice["answer"]
        start_time: float = voice["start"]
        end_time: float = voice["end"]
        words = split_and_keep(answer, puncs)
        temp_words = []
        for i, word in enumerate(words):
            # Randomly drop interior words to simulate recognition errors.
            if i > 0 and i < len(words) - 1 and random.random() < 0.15:
                log.info("随机删除word")
                continue
            temp_words.extend(word.split(" "))
        if len(temp_words) == 0:
            temp_words = words[0].split(" ")
        words = temp_words
        answer = " ".join(words)
        words = list(map(lambda x: x.strip(), words))
        words = list(filter(lambda x: len(x) > 0, words))
        # Spread the sentence's time evenly over its words (in milliseconds).
        words_withtime = []
        word_unittime = (end_time - start_time) / len(words)
        for i, word in enumerate(words):
            word_start = start_time + word_unittime * i
            word_end = word_start + word_unittime
            words_withtime.append(
                {
                    "text": word,
                    "start_time": word_start * 1000,
                    "end_time": word_end * 1000,
                }
            )
        # Collapse leading/trailing punctuation time onto the edge words —
        # punctuation tokens are treated as instantaneous.
        punc_at = 0
        while punc_at < len(words) and words[punc_at] in puncs:
            punc_at += 1
        if punc_at < len(words):
            words_withtime[punc_at]["start_time"] = words_withtime[0][
                "start_time"
            ]
            for i in range(0, punc_at):
                words_withtime[i]["start_time"] = words_withtime[0]["start_time"]
                words_withtime[i]["end_time"] = words_withtime[0]["start_time"]
        punc_at = len(words) - 1
        while punc_at >= 0 and words[punc_at] in puncs:
            punc_at -= 1
        if punc_at >= 0:
            words_withtime[punc_at]["end_time"] = words_withtime[-1]["end_time"]
            for i in range(punc_at + 1, len(words)):
                words_withtime[i]["start_time"] = (
                    words_withtime[-1]["end_time"] + 0.1
                )
                words_withtime[i]["end_time"] = words_withtime[-1]["end_time"] + 0.1
        # Occasionally emit a partial (final_result=False) prefix first.
        if random.random() < 0.4 and len(words_withtime) > 1:
            log.info("发送一次final_result=False")
            rand_idx = random.randint(1, len(words_withtime) - 1)
            recognition_result = {
                "text": " ".join(
                    map(lambda x: x["text"], words_withtime[:rand_idx])
                ),
                "final_result": False,
                "para_seq": para_seq,
                "language": "de",
                "start_time": start_time * 1000,
                "end_time": end_time * 1000,
                "words": words_withtime[:rand_idx],
            }
            callback(recognition_result)
        recognition_result = {
            "text": answer,
            "final_result": True,
            "para_seq": para_seq,
            "language": "de",
            "start_time": start_time * 1000,
            "end_time": end_time * 1000,
            "words": words_withtime,
        }
        callback(recognition_result)
        para_seq += 1
        log.info("send %s" % para_seq)
        time.sleep(send_interval)
    callback(None)
if __name__ == "__main__":
    # Serve the prediction API on all interfaces, port 80.
    app.run(host="0.0.0.0", port=80)

View File

@@ -0,0 +1,3 @@
flask
requests
pyyaml

View File

@@ -0,0 +1,16 @@
import json
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops
with open("out/detail_cases.json") as f:
detail_cases = json.load(f)
detail_case = detail_cases[0]
preds = []
for pred in detail_case["preds"]:
preds.append(StreamDataModel.model_validate(pred))
label = QueryData.model_validate(detail_case["label"])
print(evaluate_editops(label, preds))

93
tests/test_cer.py Normal file
View File

@@ -0,0 +1,93 @@
"""
f(a, b) 计算 a -> b 的编辑距离使用的方法是之前asr榜单的方法
g(a, b) 计算 a -> b 的编辑距离,使用的是原始的编辑距离计算方法
test() 是对拍程序
"""
import random
import string
from copy import deepcopy
from typing import List, Tuple
import Levenshtein
def mapping(gt: str, dt: str):
    """Split the ground-truth and decoded strings into per-character token lists."""
    return list(gt), list(dt)
def token_mapping(
    tokens_gt: List[str], tokens_dt: List[str]
) -> Tuple[List[str], List[str]]:
    """Align two token sequences by padding `None` at edit positions.

    After alignment both lists have equal length: a None on the GT side marks
    an insertion, a None on the DT side marks a deletion.
    """
    aligned_gt = deepcopy(tokens_gt)
    aligned_dt = deepcopy(tokens_dt)
    # Apply the edit script backwards so earlier indices remain valid.
    for op_name, src_pos, dst_pos in reversed(Levenshtein.editops(aligned_gt, aligned_dt)):
        if op_name == "insert":
            aligned_gt.insert(src_pos, None)
        elif op_name == "delete":
            aligned_dt.insert(dst_pos, None)
    return aligned_gt, aligned_dt
def cer(tokens_gt_mapping: List[str], tokens_dt_mapping: List[str]):
    """Count edit operations from two None-padded aligned token sequences.

    A None on the GT side is an insertion, a None on the DT side a deletion;
    aligned positions that are neither inserted nor equal are replacements.
    Returns (replace, delete, insert).
    """
    insert = tokens_gt_mapping.count(None)
    delete = tokens_dt_mapping.count(None)
    equal = 0
    for gt_tok, dt_tok in zip(tokens_gt_mapping, tokens_dt_mapping):
        if gt_tok == dt_tok:
            equal += 1
    replace = len(tokens_gt_mapping) - insert - equal  # - delete
    return replace, delete, insert
def f(a, b):
    """Edit counts for a -> b via the leaderboard alignment-mapping method."""
    gt_tokens, dt_tokens = mapping(a, b)
    aligned_gt, aligned_dt = token_mapping(gt_tokens, dt_tokens)
    return cer(aligned_gt, aligned_dt)
def raw(tokens_gt, tokens_dt):
    """Count (replace, delete, insert) straight from the Levenshtein edit script."""
    op_kinds = [
        op[0]
        for op in Levenshtein.editops(deepcopy(tokens_gt), deepcopy(tokens_dt))
    ]
    return (
        op_kinds.count("replace"),
        op_kinds.count("delete"),
        op_kinds.count("insert"),
    )
def g(a, b):
    """Edit counts for a -> b using the plain edit-distance method."""
    gt_tokens, dt_tokens = mapping(a, b)
    return raw(gt_tokens, dt_tokens)
def check(a, b):
    """Compare both edit-count implementations on one pair; print mismatches."""
    lhs = f(a, b)
    rhs = g(a, b)
    if lhs != rhs:
        print(lhs, rhs)
    return lhs == rhs
def random_string(length):
    """Return `length` random lowercase ASCII letters as one string."""
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))
def test():
    """Fuzz-compare f against g on 10000 random 30-character string pairs."""
    for _ in range(10000):
        left = random_string(30)
        right = random_string(30)
        if not check(left, right):
            print(left, right)
            break
test()

0
utils/__init__.py Normal file
View File

57
utils/asr_ter.py Normal file
View File

@@ -0,0 +1,57 @@
# copy from
# https://gitlab.4pd.io/scene_lab/leaderboard/judge_flows/foundamental_capability/blob/master/utils/asr_ter.py
def calc_ter_speechio(pred, ref, language="zh"):
    """Token error rate via the SpeechIO toolkit (Chinese only).

    Returns a dict with: ter (0..1), err_token_cnt (S+D+I) and
    ref_all_token_cnt (S+D+C). A None/empty `pred` is scored as "".
    Raises AssertionError for non-"zh" languages or an empty reference.
    """
    assert language == "zh", "Unsupported language %s" % language
    assert ref is not None and ref != "", "Reference script cannot be empty"
    if language == "zh":
        from .speechio import error_rate_zh as error_rate
        from .speechio import textnorm_zh as textnorm

        # SpeechIO normalization: full-width -> half-width, uppercase,
        # drop fillers and erhua before scoring.
        normalizer = textnorm.TextNorm(
            to_banjiao=True,
            to_upper=True,
            to_lower=False,
            remove_fillers=True,
            remove_erhua=True,
            check_chars=False,
            remove_space=False,
            cc_mode="",
        )
        norm_pred = normalizer(pred if pred is not None else "")
        norm_ref = normalizer(ref)
        tokenizer = "char"
        alignment, score = error_rate.EditDistance(
            error_rate.tokenize_text(norm_ref, tokenizer),
            error_rate.tokenize_text(norm_pred, tokenizer),
        )
        # c=correct, s=substitutions, i=insertions, d=deletions
        c, s, i, d = error_rate.CountEdits(alignment)
        ter = error_rate.ComputeTokenErrorRate(c, s, i, d) / 100.0
        return {"ter": ter, "err_token_cnt": s + d + i, "ref_all_token_cnt": s + d + c}
    # Defensive: the asserts above guarantee we never fall through.
    assert False, "Bug, not reachable"
def calc_ter_wjs(pred, ref, language="zh"):
    """Token error rate via the wjs_asr_wer calculator (Chinese only).

    Returns a dict with: ter, err_token_cnt (I+S+D) and ref_all_token_cnt.
    When the calculator reports zero reference tokens, ter falls back to 1.0.
    """
    assert language == "zh", "Unsupported language %s" % language
    assert ref is not None and ref != "", "Reference script cannot be empty"
    from . import wjs_asr_wer

    # Default calculator settings: no ignored words, case-insensitive, no split.
    ignore_words = set()
    case_sensitive = False
    split = None
    calculator = wjs_asr_wer.Calculator()
    norm_pred = wjs_asr_wer.normalize(
        wjs_asr_wer.characterize(pred if pred is not None else ""),
        ignore_words,
        case_sensitive,
        split,
    )
    norm_ref = wjs_asr_wer.normalize(wjs_asr_wer.characterize(ref), ignore_words, case_sensitive, split)
    result = calculator.calculate(norm_pred, norm_ref)
    ter = ((result["ins"] + result["sub"] + result["del"]) * 1.0 / result["all"]) if result["all"] != 0 else 1.0
    return {
        "ter": ter,
        "err_token_cnt": result["ins"] + result["sub"] + result["del"],
        "ref_all_token_cnt": result["all"],
    }

224
utils/client.py Normal file
View File

@@ -0,0 +1,224 @@
import json
import os
import threading
import time
import traceback
from copy import deepcopy
from typing import Any, List
import websocket
from pydantic_core import ValidationError
from websocket import create_connection
from schemas.context import ASRContext
from schemas.stream import StreamDataModel, StreamResultModel
from utils.logger import logger
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
class Client:
    """Synchronous websocket client: streams one WAV file to the ASR service
    under test, records send/recv timestamps in a background thread, and
    returns the parsed streaming recognition results."""

    def __init__(self, sut_url: str, context: ASRContext) -> None:
        # base_url = "ws://127.0.0.1:5003"
        self.base_url = sut_url + "/recognition"
        logger.info(f"{self.base_url}")
        # Deep-copy so the shared context is never mutated by this client.
        self.context: ASRContext = deepcopy(context)
        # if not os.getenv("DATASET_FILEPATH", ""):
        # self.base_url = "wss://speech.4paradigm.com/aibuds/api/v1/recognition"
        # self.base_url = "ws://localhost:5003/recognition"
        self.connect_num = 0
        self.exception = False
        # Sentinel "never close"; _send replaces it with a real deadline.
        self.close_time = 10**50
        self.send_time: List[float] = []
        self.recv_time: List[float] = []
        self.predict_data: List[Any] = []
        self.success = True

    def action(self):
        """Run the full cycle: connect (up to 5 attempts), start the receiver
        thread, stream the audio, wait for close, and return _gen_result()."""
        # Give up entirely if all 5 initialisation attempts fail.
        connect_success = False
        for i in range(5):
            try:
                self._connect_init()
                connect_success = True
                break
            except Exception as e:
                logger.error(f"{i+1} 次连接失败,原因:{e}")
                time.sleep(int(os.getenv("connect_sleep", 10)))
        if not connect_success:
            exit(-1)
        self.trecv = threading.Thread(target=self._recv)
        self.trecv.start()
        self._send()
        self._close()
        return self._gen_result()

    def _connect_init(self):
        """Open the websocket, send init parameters and wait for a
        {"success": true} acknowledgement within the configured window."""
        end_time = time.time() + float(os.getenv("end_time", 2))
        success = False
        try:
            self.ws = create_connection(self.base_url)
            self.ws.send(json.dumps(self._gen_init_data()))
            while time.time() < end_time and not success:
                data = self.ws.recv()
                logger.info(f"data {data}")
                if len(data) == 0:
                    time.sleep(1)
                    continue
                if isinstance(data, str):
                    try:
                        data = json.loads(data)
                    except Exception:
                        raise Exception("初始化阶段,数据不是 json 字符串格式,终止流程")
                if isinstance(data, dict):
                    success = data.get("success", False)
                    if not success:
                        logger.error(f"初始化失败,返回的结果为 {data},终止流程")
                    else:
                        break
                # Any non-success path terminates the whole process.
                logger.error("初始化阶段,数据不是 json 字符串格式,终止流程")
                exit(-1)
        except websocket.WebSocketConnectionClosedException or TimeoutError:
            # NOTE(review): `except A or B` evaluates to `except A`, so
            # TimeoutError is NOT caught here — a tuple `except (A, B)` was
            # probably intended; confirm before relying on timeout handling.
            raise Exception("初始化阶段连接中断,终止流程")
            # exit(-1)
        except ConnectionRefusedError:
            raise Exception("初始化阶段,连接失败,等待 10s 后重试,最多重试 5 次")
            # logger.error("初始化阶段,连接失败,等待 10s 后重试,最多重试 5 次")
            # self.connect_num += 1
            # if self.connect_num <= 4:
            #     time.sleep(int(os.getenv("connect_sleep", 10)))
            #     self._connect_init()
            #     success = True
            # else:
            #     logger.error("初始化阶段连接失败多次")
            #     exit(-1)
        if not success:
            # logger.error("初始化阶段 60s 没有返回数据,时间太长,终止流程")
            raise Exception("初始化阶段 60s 没有返回数据,时间太长,终止流程")
        else:
            logger.info("建立连接成功")
            self.connect_num = 0

    def _send(self):
        """Stream the WAV payload in chunk_size pieces paced by wait_time,
        then send the {"end": true} marker and schedule the close deadline."""
        send_ts = float(os.getenv("send_interval", 60))
        if not self.success:
            return
        with open(self.context.file_path, "rb") as fp:
            wav_data = fp.read()
            # Skip everything up to and including the "data" sub-chunk header.
            meta_length = wav_data.index(b"data") + 8
        try:
            with open(self.context.file_path, "rb") as fp:
                # Discard the wav header bytes.
                fp.read(meta_length)
                # Send time of the previous chunk (-1 before the first send).
                last_send_time = -1
                # Payload body.
                while True:
                    now_time = time.perf_counter()
                    if last_send_time == -1:
                        chunk = fp.read(int(self.context.chunk_size))
                    else:
                        # Catch up if we fell behind by sending multiple
                        # intervals' worth of audio at once.
                        interval_cnt = max(
                            int((now_time - last_send_time) / self.context.wait_time),
                            1,
                        )
                        chunk = fp.read(int(self.context.chunk_size * interval_cnt))
                    if not chunk:
                        break
                    send_time_start = time.perf_counter()
                    self.ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
                    self.send_time.append(send_time_start)
                    last_send_time = send_time_start
                    send_time_end = time.perf_counter()
                    if send_time_end - send_time_start > send_ts:
                        logger.error(f"发送延迟已经超过 {send_ts}s, 终止当前音频发送")
                        break
                    if (sleep_time := self.context.wait_time + now_time - send_time_end) > 0:
                        time.sleep(sleep_time)
            logger.info("当条语音数据发送完成")
            self.ws.send(json.dumps({"end": True}))
            logger.info("2s 后关闭双向连接.")
        except BrokenPipeError:
            logger.error("发送数据出错,被测服务出现故障")
        except Exception as e:
            logger.error(f"Exception: {e}")
            logger.error(f"{traceback.print_exc()}")
            logger.error("发送数据失败")
            self.success = False
        # self.close_time = time.perf_counter() + int(os.getenv("api_timeout", 2))
        self.close_time = time.perf_counter() + 20 * 60

    def _recv(self):
        """Receiver thread: collect raw result frames and stop when the
        service sends the final empty merged result."""
        try:
            while self.ws.connected and self.success:
                recv_data = self.ws.recv()
                if isinstance(recv_data, str):
                    if recv_data := str(recv_data):
                        self.recv_time.append(time.perf_counter())
                        # Close once the final merged (all-zero) result arrives.
                        # NOTE(review): `.recognition_results` does not match the
                        # `asr_results` field in schemas/stream.py — confirm
                        # which schema version this client runs against.
                        recognition_results = StreamResultModel(**json.loads(recv_data)).recognition_results
                        if (
                            recognition_results.final_result
                            and recognition_results.start_time == 0
                            and recognition_results.end_time == 0
                            and recognition_results.para_seq == 0
                        ):
                            self.success = False
                        else:
                            self.predict_data.append(recv_data)
                        # if recv_data.recognition_results.final_result and (IN_TEST or os.getenv('test')):
                        #     logger.info(f"recv_data {recv_data}")
                else:
                    self.success = False
                    raise Exception("返回的结果不是字符串形式")
        except websocket.WebSocketConnectionClosedException:
            logger.error("WebSocketConnectionClosedException")
        except ValidationError as e:
            logger.error("返回的结果不符合格式")
            logger.error(f"Exception is {e}")
            os._exit(1)
        except OSError:
            pass
        except Exception:
            logger.error(f"{traceback.print_exc()}")
            logger.error("处理被测服务返回数据时出错")
            self.success = False

    def _close(self):
        """Block until the close deadline passes or the receiver flags done,
        then close the websocket (errors ignored)."""
        while time.perf_counter() < self.close_time and self.success:
            # while not self.success:
            time.sleep(1)
        try:
            self.ws.close()
        except Exception as e:
            print(e)
            pass

    def _gen_result(self) -> dict:
        """Parse the collected raw frames and package timings + predictions."""
        if not self.predict_data:
            logger.error("没有任何数据返回")
        self.predict_data = [StreamResultModel(**json.loads(data)).recognition_results for data in self.predict_data]
        # for item in self.predict_data:
        #     if item.final_result and (IN_TEST or os.getenv('test')):
        #         logger.info(f"recv_data {item}")
        return {
            "fail": not self.predict_data,
            "send_time": self.send_time,
            "recv_time": self.recv_time,
            "predict_data": self.predict_data,
        }

    def _gen_init_data(self) -> dict:
        """Build the initialization message sent right after connecting."""
        return {
            "parameter": {
                "lang": self.context.lang,
                "sample_rate": self.context.sample_rate,
                "channel": self.context.channel,
                "format": self.context.audio_format,
                "bits": self.context.bits,
                "enable_words": self.context.enable_words,
            }
        }

277
utils/client_async.py Normal file
View File

@@ -0,0 +1,277 @@
import asyncio
import json
import os
import time
import traceback
from copy import deepcopy
from enum import Enum
from typing import Any, List
import websockets
from pydantic_core import ValidationError
from schemas.context import ASRContext
from schemas.stream import StreamResultModel, StreamWordsModel
from utils.logger import logger
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
class STATUS_DATA(str, Enum):
    """Handshake states exchanged between the sender and receiver coroutines."""

    WAITING_FIRST_INIT = "waiting_first_init"
    # NOTE(review): member values below are inconsistently named
    # (FIRST_FAIL="fail", SECOND_INIT="second_fail", THIRD_INIT="third_fail") —
    # looks like copy/paste leftovers; confirm before comparing by value.
    FIRST_FAIL = "fail"
    WAITING_SECOND_INIT = "waiting_second_init"
    SECOND_INIT = "second_fail"
    WAITING_THIRD_INIT = "waiting_third_init"
    THIRD_INIT = "third_fail"
    SUCCESS = "success"
    CLOSED = "closed"
class ClientAsync:
def __init__(self, sut_url: str, context: ASRContext, idx: int) -> None:
# base_url = "ws://127.0.0.1:5003"
self.base_url = sut_url + "/recognition"
self.context: ASRContext = deepcopy(context)
self.idx = idx
# if not os.getenv("DATASET_FILEPATH", ""):
# self.base_url = "wss://speech.4paradigm.com/aibuds/api/v1/recognition"
# self.base_url = "ws://localhost:5003/recognition"
self.fail_count = 0
self.close_time = 10**50
self.send_time: List[float] = []
self.recv_time: List[float] = []
self.predict_data: List[Any] = []
async def _sender(
self, websocket: websockets.WebSocketClientProtocol, send_queue: asyncio.Queue, recv_queue: asyncio.Queue
):
# 设置 websocket 缓冲区大小
websocket.transport.set_write_buffer_limits(1024 * 1024 * 1024)
# 发送初始化数据
await websocket.send(json.dumps(self._gen_init_data()))
await send_queue.put(STATUS_DATA.WAITING_FIRST_INIT)
connect_status = await recv_queue.get()
if connect_status == STATUS_DATA.FIRST_FAIL:
return
# 开始发送音频
with open(self.context.file_path, "rb") as fp:
wav_data = fp.read()
meta_length = wav_data.index(b"data") + 8
try:
with open(self.context.file_path, "rb") as fp:
# 去掉 wav 文件的头信息
fp.read(meta_length)
wav_time = 0.0
label_id = 0
char_contains_rate_checktime = []
char_contains_rate_checktime_id = 0
while True:
now_time = time.perf_counter()
chunk = fp.read(int(self.context.chunk_size))
if not chunk:
break
wav_time += self.context.wait_time
try:
self.send_time.append(time.perf_counter())
await asyncio.wait_for(websocket.send(chunk), timeout=0.08)
except asyncio.exceptions.TimeoutError:
pass
while label_id < len(self.context.labels) and wav_time >= self.context.labels[label_id].start:
char_contains_rate_checktime.append(now_time + 3.0)
label_id += 1
predict_text_len = sum(map(lambda x: len(x.text), self.predict_data))
while char_contains_rate_checktime_id < len(char_contains_rate_checktime) and \
char_contains_rate_checktime[char_contains_rate_checktime_id] <= now_time:
label_text_len = sum(
map(lambda x: len(x.answer),
self.context.labels[:char_contains_rate_checktime_id+1]))
if predict_text_len / self.context.char_contains_rate < label_text_len:
self.context.fail_char_contains_rate_num += 1
char_contains_rate_checktime_id += 1
await asyncio.sleep(max(0, self.context.wait_time - (time.perf_counter() - now_time)))
await websocket.send(json.dumps({"end": True}))
logger.info(f"{self.idx} 条数据,当条语音数据发送完成")
logger.info(f"{self.idx} 条数据3s 后关闭双向连接.")
self.close_time = time.perf_counter() + 3
except websockets.exceptions.ConnectionClosedError:
logger.error(f"{self.idx} 条数据发送过程中,连接断开")
except Exception:
logger.error(f"{traceback.print_exc()}")
logger.error(f"{self.idx} 条数据,发送数据失败")
async def _recv(
self, websocket: websockets.WebSocketClientProtocol, send_queue: asyncio.Queue, recv_queue: asyncio.Queue
):
await recv_queue.get()
try:
await asyncio.wait_for(websocket.recv(), timeout=2)
except asyncio.exceptions.TimeoutError:
await send_queue.put(STATUS_DATA.FIRST_FAIL)
logger.info(f"{self.idx} 条数据,初始化阶段, 2s 没收到 success 返回,超时了")
self.fail_count += 1
return
except Exception as e:
await send_queue.put(STATUS_DATA.FIRST_FAIL)
logger.error(f"{self.idx} 条数据,初始化阶段, 收到异常:{e}")
self.fail_count += 1
return
else:
await send_queue.put(STATUS_DATA.SUCCESS)
# 开始接收语音识别结果
try:
while websocket.open:
# 接收数据
recv_data = await websocket.recv()
if isinstance(recv_data, str):
self.recv_time.append(time.perf_counter())
recv_data = str(recv_data)
recv_data = json.loads(recv_data)
result = StreamResultModel(**recv_data)
recognition_results = result.asr_results
if (
recognition_results.final_result
and not recognition_results.language
and recognition_results.start_time == 0
and recognition_results.end_time == 0
and recognition_results.para_seq == 0
):
pass
else:
self.predict_data.append(recognition_results)
else:
raise Exception("返回的结果不是字符串形式")
except websockets.exceptions.ConnectionClosedOK:
pass
except websockets.exceptions.ConnectionClosedError:
pass
except ValidationError as e:
logger.error(f"{self.idx} 条数据,返回的结果不符合格式")
logger.error(f"Exception is {e}")
os._exit(1)
except OSError:
pass
except Exception:
logger.error(f"{traceback.print_exc()}")
logger.error(f"{self.idx} 条数据,处理被测服务返回数据时出错")
async def _action(self):
    """Run one full test round for this audio item, retrying on init failure.

    Opens a websocket to the service under test, runs the sender and receiver
    coroutines, and retries up to 3 times if initialization never succeeded
    (detected by ``self.send_time`` staying empty).
    """
    logger.info(f"第 {self.idx} 条数据开始测试")
    while self.fail_count < 3:
        send_queue = asyncio.Queue()
        recv_queue = asyncio.Queue()
        self.send_time: List[float] = []
        self.recv_time: List[float] = []
        self.predict_data: List[Any] = []
        async with websockets.connect(self.base_url) as websocket:
            send_task = asyncio.create_task(self._sender(websocket, send_queue, recv_queue))
            recv_task = asyncio.create_task(self._recv(websocket, recv_queue, send_queue))
            # Wait for the sender to finish, give the server 3s to flush the
            # remaining results, then drain the receiver.
            await asyncio.gather(send_task)
            await asyncio.sleep(3)
            await asyncio.gather(recv_task)
        if self.send_time:
            break
        else:
            self.fail_count += 1
            logger.info(f"第 {self.idx} 条数据,初始化阶段, 第 {self.fail_count} 次失败, 1s 后重试")
            # BUG FIX: time.sleep(1) here would block the running event loop
            # (and any other concurrent test coroutines); use asyncio.sleep.
            await asyncio.sleep(1)
def action(self):
    """Synchronous entry point: drive the async test flow to completion and
    return the aggregated result context."""
    asyncio.run(self._action())
    outcome = self._gen_result()
    return outcome
def _gen_result(self) -> ASRContext:
    """Fold the collected predictions into the context and score punctuation.

    For every labelled sentence boundary, searches the timestamps of predicted
    punctuation words for one falling inside the allowed window; each match
    increments ``self.context.pred_punctuation_num``.
    """
    import bisect  # stdlib replacement for the previous hand-rolled binary searches

    if not self.predict_data:
        logger.error(f"第 {self.idx} 条数据,没有任何数据返回")
    self.context.append_preds(self.predict_data, self.send_time, self.recv_time)
    self.context.fail = not self.predict_data
    # Collect predicted words that are punctuation marks for the detected language.
    punctuation_words: List[StreamWordsModel] = []
    for pred in self.predict_data:
        punctuations = [",", ".", "!", "?"]
        # NOTE(review): the language-specific lists below contain empty strings
        # in this revision — they look like punctuation characters stripped by
        # an encoding problem; restore the originals from version control.
        if pred.language == "zh":
            punctuations = ["", "", "", ""]
        elif pred.language == "ja":
            punctuations = ["", "", "", ""]
        elif pred.language in ("ar", "fa"):
            punctuations = ["،", ".", "!", "؟"]
        elif pred.language == "el":
            punctuations = [",", ".", "", ""]
        elif pred.language == "ti":
            punctuations = [""]
        for word in pred.words:
            if word.text in punctuations:
                punctuation_words.append(word)
    # Sorted timestamp arrays for binary search (hoisted out of the label loop).
    start_times = sorted(w.start_time for w in punctuation_words)
    end_times = sorted(w.end_time for w in punctuation_words)
    self.context.punctuation_num = len(self.context.labels)
    label_n = len(self.context.labels)
    for i, label in enumerate(self.context.labels):
        # Match window: +/-0.7s around the label end, or — for non-final
        # labels — the gap between this label's end and the next one's start.
        label_left = (label.end - 0.7)
        label_right = (label.end + 0.7)
        if i < label_n - 1:
            label_left = label.end
            label_right = self.context.labels[i + 1].start
        exist = False
        # First predicted punctuation whose start_time >= label_left
        # (bisect_left == the original upper_bound helper).
        li = bisect.bisect_left(start_times, label_left)
        if li < len(start_times) and start_times[li] <= label_right:
            exist = True
        # Last predicted punctuation whose end_time <= label_right
        # (bisect_right - 1 == the original lower_bound helper).
        ri = bisect.bisect_right(end_times, label_right) - 1
        if ri >= 0 and end_times[ri] >= label_left:
            exist = True
        if exist:
            self.context.pred_punctuation_num += 1
    return self.context
def _gen_init_data(self) -> dict:
return {
"parameter": {
"lang": None,
"sample_rate": self.context.sample_rate,
"channel": self.context.channel,
"format": self.context.audio_format,
"bits": self.context.bits,
"enable_words": self.context.enable_words,
}
}

409
utils/client_callback.py Normal file
View File

@@ -0,0 +1,409 @@
import logging
import os
import threading
import time
from typing import Dict, List, Optional
import requests
from flask import Flask, abort, request
from pydantic import BaseModel, Field, ValidationError, field_validator
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops, evaluate_punctuation
from .logger import log
# Pod IP injected by the orchestrator (e.g. Kubernetes downward API); used to
# build the callback URLs handed to the service under test. Raises KeyError at
# import time if unset — intentional fail-fast.
MY_POD_IP = os.environ["MY_POD_IP"]


class StopException(Exception):
    """Raised to abort the current evaluation flow."""
    ...
class EvaluateResult(BaseModel):
    """Aggregated evaluation metrics for one audio query, plus raw preds/labels."""

    lang: str
    cer: float
    # Histogram: sentence-start alignment delta (ms) -> number of aligned sentences.
    align_start: Dict[int, int] = Field(
        description="句首字对齐时间差值(ms) -> 对齐数"
    )
    # Histogram: sentence-end alignment delta (ms) -> number of aligned sentences.
    align_end: Dict[int, int] = Field(
        description="句尾字对齐时间差值(ms) -> 对齐数"
    )
    # Summed distance (seconds) between predicted and labelled first words.
    first_word_distance_sum: float = Field(description="句首字距离总和(s)")
    # Summed distance (seconds) between predicted and labelled last words.
    last_word_distance_sum: float = Field(description="句尾字距离总和(s)")
    rtf: float = Field(description="翻译速度")
    first_receive_delay: float = Field(description="首包接收延迟(s)")
    query_count: int = Field(description="音频数")
    voice_count: int = Field(description="句子数")
    pred_punctuation_num: int = Field(description="预测标点数")
    label_punctuation_num: int = Field(description="标注标点数")
    pred_sentence_punctuation_num: int = Field(description="预测句子标点数")
    # NOTE(review): "setence" is a typo for "sentence"; kept — renaming would
    # break serialized payloads and downstream readers.
    label_setence_punctuation_num: int = Field(description="标注句子标点数")
    preds: List[StreamDataModel] = Field(description="预测结果")
    label: QueryData = Field(description="标注结果")
class ResultModel(BaseModel):
    """One callback payload posted by the ASR service under test."""

    taskId: str
    status: str
    message: str = Field("")
    recognition_results: Optional[StreamDataModel] = Field(None)

    @field_validator("recognition_results", mode="after")
    def convert_to_seconds(cls, v: Optional[StreamDataModel], values):
        # Convert all timestamps from milliseconds to seconds in place.
        if v is None:
            return v
        v.end_time = v.end_time / 1000
        v.start_time = v.start_time / 1000
        for word in v.words:
            word.start_time /= 1000
            word.end_time /= 1000
        return v
class ClientCallback:
    """Callback-style client used to evaluate a batch ASR service.

    A Flask app running in a daemon thread receives per-task recognition
    callbacks and heartbeat reports from the service under test. ``predict``
    submits one audio file at a time; ``dead_line_check`` and
    ``heartbeat_check`` watchdog threads enforce progress/heartbeat deadlines;
    ``evaluate`` turns the collected results into an ``EvaluateResult``.
    """

    def __init__(self, sut_url: str, port: int):
        self.sut_url = sut_url  # URL of the ASR service under test, e.g. http://asr-service:8080
        self.port = port  # local port this client listens on for callbacks
        # Create the Flask app and register the two callback routes:
        #   1. /api/asr/batch-callback/<taskId> — recognition-result callbacks
        #      (handled by self.asr_callback; taskId identifies the task).
        #   2. /api/asr-runner/report — heartbeat reports from the runner
        #      (handled by self.heartbeat).
        self.app = Flask(__name__)
        self.app.add_url_rule(
            "/api/asr/batch-callback/<taskId>",
            view_func=self.asr_callback,
            methods=["POST"],
        )
        self.app.add_url_rule(
            "/api/asr-runner/report",
            view_func=self.heartbeat,
            methods=["POST"],
        )
        logging.getLogger("werkzeug").disabled = True
        # Serve the callback endpoints in a daemon thread so predict() can block.
        threading.Thread(
            target=self.app.run, args=("0.0.0.0", port), daemon=True
        ).start()
        self.mutex = threading.Lock()
        self.finished = threading.Event()
        # NOTE(review): attribute name is a typo for "product_available";
        # kept because external code reads this exact name.
        self.product_avaiable = True
        self.reset()

    def reset(self):
        """Reset all per-audio state before a new predict() round."""
        self.begin_time = None
        self.end_time = None
        self.first_receive_time = None
        self.last_heartbeat_time = None
        self.app_on = False
        self.para_seq = 0
        self.finished.clear()
        self.error: Optional[str] = None
        self.last_recognition_result: Optional[StreamDataModel] = None
        self.recognition_results: List[StreamDataModel] = []

    def asr_callback(self, taskId: str):
        """Flask view: receive and validate one recognition callback for taskId.

        Enforces the streaming contract (para_seq monotonic, start_time not
        before the previous end_time); any violation records the error, stops
        the run, and returns HTTP 404.
        """
        if self.app_on is False:
            abort(400)
        body = request.get_json(silent=True)  # silent parse: returns None on invalid JSON
        if body is None:
            abort(404)
        try:
            # Validate the callback payload structure.
            result = ResultModel.model_validate(body)
        except ValidationError as e:
            log.error("asr_callback: 结果格式错误: %s", e)
            abort(404)
        # Task finished: stop the run.
        if result.status == "FINISHED":
            with self.mutex:
                self.stop()
            return "ok"
        # Any status other than RUNNING is an error.
        if result.status != "RUNNING":
            log.error(
                "asr_callback: 结果状态错误: %s, message: %s",
                result.status,
                result.message,
            )
            abort(404)
        recognition_result = result.recognition_results
        if recognition_result is None:
            log.error("asr_callback: 结果中没有recognition_results字段")
            abort(404)
        with self.mutex:
            if not self.app_on:
                log.error("asr_callback: 应用已结束")
                abort(400)
            # para_seq must never go backwards.
            if recognition_result.para_seq < self.para_seq:
                error = "asr_callback: 结果中para_seq小于上一次的: %d < %d" % (
                    recognition_result.para_seq,
                    self.para_seq,
                )
                log.error(error)
                if self.error is None:
                    self.error = error
                self.stop()
                abort(404)
            # para_seq must not skip ahead: every segment needs a final_result=True ack.
            if recognition_result.para_seq > self.para_seq + 1:
                error = (
                    "asr_callback: 结果中para_seq大于上一次的+1 \
说明存在para_seq = %d没有final_result为True确认"
                    % (self.para_seq + 1,)
                )
                log.error(error)
                if self.error is None:
                    self.error = error
                self.stop()
                abort(404)
            # Timestamps must be non-overlapping and increasing.
            if (
                self.last_recognition_result is not None
                and recognition_result.start_time
                < self.last_recognition_result.end_time
            ):
                error = "asr_callback: 结果中start_time小于上一次的end_time: %s < %s" % (
                    recognition_result.start_time,
                    self.last_recognition_result.end_time,
                )
                log.error(error)
                if self.error is None:
                    self.error = error
                self.stop()
                abort(404)
            self.recognition_results.append(recognition_result)
            if recognition_result.final_result is True:
                self.para_seq = recognition_result.para_seq
            if self.last_recognition_result is None:
                self.first_receive_time = time.time()
            self.last_recognition_result = recognition_result
        return "ok"

    def heartbeat(self):
        """Flask view: record runner heartbeats (updates last_heartbeat_time).

        BUG FIX: this method had been commented out (wrapped in a string)
        while __init__ still registered it as a view function, so constructing
        ClientCallback raised AttributeError. Restored.
        """
        if self.app_on is False:
            abort(400)
        body = request.get_json(silent=True)
        if body is None:
            abort(404)
        status = body.get("status")
        if status != "RUNNING":
            message = body.get("message", "")
            if message:
                message = ", message: " + message
            log.error("heartbeat: 状态错误: %s%s", status, message)
            return "ok"
        with self.mutex:
            self.last_heartbeat_time = time.time()
        return "ok"

    def predict(
        self,
        language: Optional[str],
        audio_file: str,
        audio_duration: float,
        task_id: str,
    ):
        """Submit one audio file to the service's /predict endpoint.

        Raises StopException if a previous audio is still in flight, the HTTP
        call fails, or the service does not answer status "OK". On success,
        spawns the deadline and heartbeat watchdog threads.
        """
        # The mutex guarantees only one audio is in flight at a time.
        with self.mutex:
            if self.app_on:
                log.error("上一音频尚未完成处理,流程出现异常")
                raise StopException()
            self.reset()
            self.app_on = True
        # BUG FIX: the upload file handle was opened inline and never closed
        # (leaked one fd per audio); use a context manager.
        with open(audio_file, "rb") as audio_fp:
            resp = requests.post(
                self.sut_url + "/predict",
                data={
                    "language": language,
                    "taskId": task_id,
                    "progressCallbackUrl": "http://%s:%d/api/asr/batch-callback/%s"
                    % (MY_POD_IP, self.port, task_id),
                    "heartbeatUrl": "http://%s:%d/api/asr-runner/report" % (MY_POD_IP, self.port),
                },
                files={"file": (audio_file, audio_fp)},
                timeout=60,
            )
        # Response handling.
        if resp.status_code != 200:
            log.error("/predict接口返回http code %s", resp.status_code)
            raise StopException()
        resp.raise_for_status()
        status = resp.json().get("status")
        if status != "OK":
            log.error("/predict接口返回非OK状态: %s", status)
            raise StopException()
        # Watchdog threads: progress deadline and heartbeat timeout.
        threading.Thread(
            target=self.dead_line_check, args=(audio_duration,), daemon=True
        ).start()
        threading.Thread(target=self.heartbeat_check, daemon=True).start()

    def dead_line_check(self, audio_duration: float):
        """Watch recognition progress against a moving "death line".

        Expects the service to keep up with 5.4x real time after a 30s grace
        period; marks the product unavailable if it falls behind, and fails
        the run outright past the hard deadline ``ddl``.
        """
        begin_time = time.time()
        self.begin_time = begin_time
        # First-packet check: allow 10s for the first result.
        self.sleep_to(begin_time + 10)
        with self.mutex:
            if self.last_recognition_result is None:
                error = "首包延迟内未收到返回"
                log.error(error)
                if self.error is None:
                    self.error = error
                self.stop()
                return
        # First progress check happens at begin_time + 30.
        next_checktime = begin_time + 30
        ddl = begin_time + max((audio_duration / 6) + 10, 30)
        while time.time() < ddl:
            self.sleep_to(next_checktime)
            with self.mutex:
                if self.finished.is_set():
                    return
                if self.last_recognition_result is None:
                    error = "检测追赶线过程中获取最后一次识别结果异常"
                    log.error(error)
                    if self.error is None:
                        self.error = error
                    self.stop()
                    return
                last_end_time = self.last_recognition_result.end_time
                # Audio position expected at 5.4x real time after the 30s grace.
                expect_end_time = (next_checktime - begin_time - 30) * 5.4
                if last_end_time < expect_end_time:
                    log.warning(
                        "识别时间位置 %s 被死亡追赶线 %s 已追上,将置为产品不可用",
                        last_end_time,
                        expect_end_time,
                    )
                    self.product_avaiable = False
                    # NOTE(review): sleeping to ddl while holding self.mutex
                    # blocks asr_callback/heartbeat until then — confirm intended.
                    self.sleep_to(ddl)
                    break
                next_checktime = last_end_time / 5.4 + begin_time + 30 + 1
                next_checktime = min(next_checktime, ddl)
        with self.mutex:
            if self.finished.is_set():
                return
        log.warning("识别速度rtf低于1/6, 将置为产品不可用")
        self.product_avaiable = False
        # Extended grace: wait up to the softer 1/3-real-time deadline.
        self.sleep_to(begin_time + max((audio_duration / 3) + 10, 30))
        with self.mutex:
            if self.finished.is_set():
                return
            error = "处理时间超过ddl %s " % (ddl - begin_time)
            log.error(error)
            if self.error is None:
                self.error = error
            self.stop()
            return

    def heartbeat_check(self):
        """Fail the run if no heartbeat arrives for more than 30 seconds."""
        self.last_heartbeat_time = time.time()
        while True:
            with self.mutex:
                if self.finished.is_set():
                    return
                if time.time() - self.last_heartbeat_time > 30:
                    error = "asr_runner 心跳超时 %s" % (
                        time.time() - self.last_heartbeat_time
                    )
                    log.error(error)
                    if self.error is None:
                        self.error = error
                    self.stop()
                    return
            time.sleep(5)

    def sleep_to(self, to: float):
        """Sleep until wall-clock time ``to``; no-op if that time has passed."""
        seconds = to - time.time()
        if seconds <= 0:
            return
        time.sleep(seconds)

    def stop(self):
        """Mark the current audio as finished (callers hold self.mutex)."""
        self.end_time = time.time()
        self.finished.set()
        self.app_on = False

    def evaluate(self, query_data: QueryData):
        """Score the collected recognition results against ``query_data``.

        Raises StopException if timing bookkeeping is incomplete; may flip
        ``product_avaiable`` when the 300ms first-word alignment rate < 0.8.
        """
        log.info("开始评估")
        if (
            self.begin_time is None
            or self.end_time is None
            or self.first_receive_time is None
        ):
            if self.begin_time is None:
                log.error("评估流程异常 无开始时间")
            if self.end_time is None:
                log.error("评估流程异常 无结束时间")
            if self.first_receive_time is None:
                log.error("评估流程异常 无首次接收时间")
            raise StopException()
        # RTF: wall time minus the 10s first-packet grace, over audio duration.
        rtf = max(self.end_time - self.begin_time - 10, 0) / query_data.duration
        first_receive_delay = max(self.first_receive_time - self.begin_time, 0)
        query_count = 1
        voice_count = len(query_data.voice)
        preds = self.recognition_results
        # Keep only final results for scoring.
        self.recognition_results = list(
            filter(lambda x: x.final_result, self.recognition_results)
        )
        (
            pred_punctuation_num,
            label_punctuation_num,
            pred_sentence_punctuation_num,
            label_setence_punctuation_num,
        ) = evaluate_punctuation(query_data, self.recognition_results)
        (
            cer,
            _,
            align_start,
            align_end,
            first_word_distance_sum,
            last_word_distance_sum,
        ) = evaluate_editops(query_data, self.recognition_results)
        # NOTE(review): assumes align_start always contains a 300ms bucket and
        # voice_count > 0 — confirm evaluate_editops/dataset guarantee this.
        if align_start[300] / voice_count < 0.8:
            log.warning(
                "评估结果首字300ms对齐率 %s < 0.8, 将置为产品不可用",
                align_start[300] / voice_count,
            )
            self.product_avaiable = False
        return EvaluateResult(
            lang=query_data.lang,
            cer=cer,
            align_start=align_start,
            align_end=align_end,
            first_word_distance_sum=first_word_distance_sum,
            last_word_distance_sum=last_word_distance_sum,
            rtf=rtf,
            first_receive_delay=first_receive_delay,
            query_count=query_count,
            voice_count=voice_count,
            pred_punctuation_num=pred_punctuation_num,
            label_punctuation_num=label_punctuation_num,
            pred_sentence_punctuation_num=pred_sentence_punctuation_num,
            label_setence_punctuation_num=label_setence_punctuation_num,
            preds=preds,
            label=query_data,
        )

445
utils/evaluate.py Normal file
View File

@@ -0,0 +1,445 @@
import os
import subprocess
from collections import defaultdict
from typing import Dict, List
from utils import asr_ter
from utils.logger import logger
log_mid_result = int(os.getenv("log", 0)) == 1
class AsrEvaluator:
    """Base evaluator: accumulates CER and sentence-cut metrics across queries."""

    def __init__(self) -> None:
        self.query_count = 0  # number of queries (audio clips)
        self.voice_count = 0  # voice segments carrying start/end times (used for RTF)
        # Punctuation used for sentence splitting; split order follows list
        # order, so e.g. "..." must come before ".".
        self.cut_punc = []
        # CER accumulators
        self.one_minus_cer = 0  # sum over queries of (1 - cer)
        self.token_count = 0  # sum over queries of char/word counts
        # sentence-cut accumulators
        self.miss_count = 0  # sum over queries of missed cut points
        self.more_count = 0  # sum over queries of spurious cut points
        self.cut_count = 0  # sum over queries of labelled cut points
        self.rate = 0  # sum over queries of cut-rate
        # per-query detail cases
        self.result = []

    def evaluate(self, eval_result):
        # Implemented by language-specific subclasses.
        pass

    def post_evaluate(self):
        pass

    def gen_result(self) -> Dict:
        """Aggregate accumulated metrics into (output_result, detail_case).

        NOTE(review): reads self.rtf / self.rtf_end, which are never set in
        __init__ — AttributeError unless assigned elsewhere. Divisions also
        assume query_count and voice_count are non-zero; confirm callers.
        """
        output_result = dict()
        output_result["query_count"] = self.query_count
        output_result["voice_count"] = self.voice_count
        output_result["token_cnt"] = self.token_count
        output_result["one_minus_cer"] = self.one_minus_cer
        output_result["one_minus_cer_metrics"] = self.one_minus_cer / self.query_count
        output_result["miss_count"] = self.miss_count
        output_result["more_count"] = self.more_count
        output_result["cut_count"] = self.cut_count
        output_result["cut_rate"] = self.rate
        output_result["cut_rate_metrics"] = self.rate / self.query_count
        output_result["rtf"] = self.rtf
        output_result["rtf_end"] = self.rtf_end
        output_result["rtf_metrics"] = self.rtf / self.voice_count
        output_result["rtf_end_metrics"] = self.rtf_end / self.voice_count
        detail_case = self.result
        return output_result, detail_case

    def _get_predict_final_sentences(self, predict_data: List[Dict]) -> List[str]:
        """Return the text of every prediction marked final_result=True.

        NOTE(review): "recoginition_results" is a misspelled key kept for
        compatibility with the producer's payload format.
        """
        return [
            item["recoginition_results"]["text"]
            for item in predict_data
            if item["recoginition_results"]["final_result"]
        ]

    def _sentence_final_index(self, sentences: List[str], tokens: List[str], tokenizer="word") -> List[int]:
        """Map each sentence's tokens to their indices in the aligned token list.

        Appends one index per matched token; the last appended index for a
        sentence is the position of that sentence's final token.

        NOTE(review): `Tokenizer` is not imported in this module — this method
        raises NameError as written; confirm the missing import.
        """
        token_index_list = []
        token_idx = 0
        for sentence in sentences:
            for token in Tokenizer.tokenize(sentence, tokenizer):
                if token not in tokens:
                    continue
                while tokens[token_idx] != token:
                    token_idx += 1
                token_index_list.append(token_idx)
        return token_index_list

    def _voice_to_cut_sentence(self, voice_sentences: List[str]) -> Dict:
        """Split dataset voice segments into minimal sentence units.

        Splits each segment with every punc in cut_punc (in order), then drops
        empty pieces.
        Example:
            ["你好,你好呀", "你好,我在写抽象的代码逻辑"]
            ->
            cut_sentences: ["你好", "你好呀", "你好", "我在写抽象的代码逻辑"]
            cut_sentence_index_list: [1, 3]
            (index of the sentence unit containing each voice unit's last char)
        """
        voice_sentences_result = defaultdict(list)
        for voice_sentence in voice_sentences:
            sentence_list = [voice_sentence]
            sentence_tmp_list = []
            for punc in self.cut_punc:
                for sentence in sentence_list:
                    sentence_tmp_list.extend(sentence.split(punc))
                sentence_list, sentence_tmp_list = sentence_tmp_list, []
            sentence_list = [item for item in sentence_list if item]
            # Sentence units produced by the split.
            voice_sentences_result["cut_sentences"].extend(sentence_list)
            # Index of the sentence unit holding this voice unit's last char.
            voice_sentences_result["cut_sentence_index_list"].append(len(voice_sentences_result["cut_sentences"]) - 1)
        return voice_sentences_result

    def _voice_bytes_index(self, timestamp, sample_rate=16000, bit_depth=16, channels=1):
        """Convert a timestamp (seconds) to a byte offset in raw PCM audio."""
        bytes_per_sample = bit_depth // 8
        return timestamp * sample_rate * bytes_per_sample * channels
class AsrZhEvaluator(AsrEvaluator):
    """Chinese-language evaluation: char-level CER plus sentence-cut rate."""

    def __init__(self):
        super().__init__()
        # NOTE(review): several entries below are empty strings in this
        # revision — they appear to be CJK punctuation stripped by an encoding
        # problem; restore the original characters from version control.
        self.cut_zh_punc = ["······", "......", "", "", "", "", "", ""]
        self.cut_en_punc = ["...", ".", ",", "?", "!", ";", ":"]
        self.cut_punc = self.cut_zh_punc + self.cut_en_punc

    def evaluate(self, eval_result) -> Dict:
        """Score one query: accumulate CER and cut-rate from labels vs predictions."""
        self.query_count += 1
        self.voice_count += len(eval_result["voice"])
        # Label voice units (not yet split into sentence units).
        label_voice_sentences = [item["answer"] for item in eval_result["voice"]]
        # print("label_voice_sentences", label_voice_sentences)
        # Convert voice units -> sentence units.
        voice_to_cut_info = self._voice_to_cut_sentence(label_voice_sentences)
        # print("voice_to_cut_info", voice_to_cut_info)
        # Label sentence units.
        label_sentences = voice_to_cut_info["cut_sentences"]
        # Mapping: index of the sentence unit holding each voice unit's last char.
        cut_sentence_index_list = voice_to_cut_info["cut_sentence_index_list"]
        # Normalise label sentence units.
        label_sentences = [self._sentence_norm(sentence) for sentence in label_sentences]
        if log_mid_result:
            logger.info(f"label_sentences {label_sentences}")
        # print("label_sentences", label_sentences)
        # Predicted sentence units (final results only).
        predict_sentences_raw = self._get_predict_final_sentences(eval_result["predict_data"])
        # print("predict_sentences_raw", predict_sentences_raw)
        # Normalise predicted sentence units.
        predict_sentences = [self._sentence_norm(sentence) for sentence in predict_sentences_raw]
        if log_mid_result:
            logger.info(f"predict_sentences {predict_sentences}")
        # print("predict_sentences", predict_sentences)
        # Token-level alignment via minimum edit distance.
        label_tokens, predict_tokens = self._sentence_transfer("".join(label_sentences), "".join(predict_sentences))
        # CER
        cer_info = self.cer(label_sentences, predict_sentences)
        if log_mid_result:
            logger.info(f"cer_info {cer_info}")
        # print("cer_info", cer_info)
        self.one_minus_cer += cer_info["one_minus_cer"]
        self.token_count += cer_info["token_count"]
        # Sentence-cut precision/recall.
        cut_info = self.cut_rate(label_sentences, predict_sentences, label_tokens, predict_tokens)
        if log_mid_result:
            logger.info(f"{cut_info['miss_count']}, {cut_info['more_count']}, {cut_info['rate']}")
        # print("cut_info", cut_info)
        # print(cut_info["miss_count"], cut_info["more_count"], cut_info["rate"])
        self.miss_count += cut_info["miss_count"]
        self.more_count += cut_info["more_count"]
        self.cut_count += cut_info["cut_count"]
        self.rate += cut_info["rate"]
        self.result.append(
            {
                "label_tokens": label_tokens,
                "predict_tokens": predict_tokens,
                "one_minus_cer": cer_info["one_minus_cer"],
                # NOTE(review): "token_count" is filled with one_minus_cer —
                # looks like a copy/paste slip; confirm intended value.
                "token_count": cer_info["one_minus_cer"],
                "miss_count": cut_info["miss_count"],
                "more_count": cut_info["more_count"],
                "cut_count": cut_info["cut_count"],
                "rate": cut_info["rate"],
            }
        )

    def cer(self, label_sentences, predict_sentences):
        """Char error rate via speechio TER; returns 1-cer and ref token count."""
        pred_str = ''.join(predict_sentences) if predict_sentences is not None else ''
        label_str = ''.join(label_sentences)
        r = asr_ter.calc_ter_speechio(pred_str, label_str)
        one_minus_cer = max(1.0 - r['ter'], 0)
        token_count = r['ref_all_token_cnt']
        return {"one_minus_cer": one_minus_cer, "token_count": token_count}

    def cut_rate(self, label_sentences, predict_sentences, label_tokens, predict_tokens):
        """Sentence-cut score: rate = max(1 - (miss + 2*more)/label_cuts, 0).

        NOTE(review): ZeroDivisionError if there are no labelled cut points.
        """
        label_final_index_list = set(self._sentence_final_index(label_sentences, label_tokens))
        pred_final_index_list = set(self._sentence_final_index(predict_sentences, predict_tokens))
        label_sentence_count = len(label_final_index_list)
        miss_count = len(label_final_index_list - pred_final_index_list)
        more_count = len(pred_final_index_list - label_final_index_list)
        rate = max(1 - (miss_count + more_count * 2) / label_sentence_count, 0)
        return {
            "miss_count": miss_count,
            "more_count": more_count,
            "cut_count": label_sentence_count,
            "rate": rate,
            "label_final_index_list": label_final_index_list,
            "pred_final_index_list": pred_final_index_list,
        }

    def _sentence_norm(self, sentence, tokenizer="word"):
        """Normalise one Chinese sentence with speechio textnorm.

        NOTE(review): implicitly returns None for unsupported tokenizers.
        """
        from utils.speechio import textnorm_zh as textnorm
        if tokenizer == "word":
            normalizer = textnorm.TextNorm(
                to_banjiao=True,
                to_upper=True,
                to_lower=False,
                remove_fillers=True,
                remove_erhua=False,  # unlike batch recognition, this is False here
                check_chars=False,
                remove_space=False,
                cc_mode="",
            )
            return normalizer(sentence)
        else:
            logger.error("tokenizer error, not support.")

    def _sentence_transfer(self, label_sentence: str, predict_sentence: str, tokenizer="char"):
        """Align label and prediction char-by-char via minimum edit distance.

        args:
            label: "今天的通话质量不错呀昨天的呢"
            predict: "今天的通话质量不错昨天呢星期"
            tokenizer: tokenisation mode
        return:
            (label_tokens, pred_tokens): equal-length aligned lists where a
            None marks an insertion/deletion position on that side.
        """
        from utils.speechio import error_rate_zh as error_rate
        if tokenizer == "char":
            alignment, score = error_rate.EditDistance(
                error_rate.tokenize_text(label_sentence, tokenizer),
                error_rate.tokenize_text(predict_sentence, tokenizer),
            )
            label_tokens, pred_tokens = [], []
            for align in alignment:
                # print(align.__dict__)
                label_tokens.append(align.ref)
                pred_tokens.append(align.hyp)
            return (label_tokens, pred_tokens)
        else:
            logger.error("tokenizer 出错了,暂时不支持其它的")

    def _pred_data_transfer(self, predict_data, recv_time):
        """Group streaming predictions into per-sentence lists with receive times.

        predict_data = [
            {"recoginition_results": {"text": "1", "final_result": False, "para_seq": 0}},
            {"recoginition_results": {"text": "12", "final_result": False, "para_seq": 0}},
            {"recoginition_results": {"text": "123", "final_result": True, "para_seq": 0}},
            {"recoginition_results": {"text": "4", "final_result": False, "para_seq": 0}},
            {"recoginition_results": {"text": "45", "final_result": False, "para_seq": 0}},
            {"recoginition_results": {"text": "456", "final_result": True, "para_seq": 0}},
        ]
        recv_time = [1, 3, 5, 6, 7, 8]
        ->
        [
            [{'text': '1', 'time': 1}, {'text': '12', 'time': 3}, {'text': '123', 'time': 5}],
            [{'text': '4', 'time': 6}, {'text': '45', 'time': 7}, {'text': '456', 'time': 8}],
        ]
        """
        pred_sentence_info = []
        pred_sentence_index = 0
        for predict_item, recv_time_item in zip(predict_data, recv_time):
            if len(pred_sentence_info) == pred_sentence_index:
                pred_sentence_info.append([])
            pred_sentence_info[pred_sentence_index].append(
                {
                    "text": predict_item["recoginition_results"]["text"],
                    "time": recv_time_item,
                }
            )
            # A final result closes the current sentence group.
            if predict_item["recoginition_results"]["final_result"]:
                pred_sentence_index += 1
        return pred_sentence_info
class AsrEnEvaluator(AsrEvaluator):
    """English-language evaluation: word-level WER plus sentence-cut rate."""

    def evaluate(self, eval_result) -> Dict:
        """Score one query: accumulate WER and cut-rate from labels vs predictions."""
        self.query_count += 1
        self.voice_count += len(eval_result["voice"])
        # Label voice units (not yet split into sentence units).
        label_voice_sentences = [item["answer"] for item in eval_result["voice"]]
        # print("label_voice_sentences", label_voice_sentences)
        # Convert voice units -> sentence units.
        voice_to_cut_info = self._voice_to_cut_sentence(label_voice_sentences)
        # print("voice_to_cut_info", voice_to_cut_info)
        # Label sentence units.
        label_sentences = voice_to_cut_info["cut_sentences"]
        # Mapping: index of the sentence unit holding each voice unit's last word.
        cut_sentence_index_list = voice_to_cut_info["cut_sentence_index_list"]
        # Normalise label sentence units (batch textnorm via subprocess).
        label_sentences = self._sentence_list_norm(label_sentences)
        # [self._sentence_norm(sentence) for sentence in label_sentences]
        # print("label_sentences", label_sentences)
        if log_mid_result:
            logger.info(f"label_sentences {label_sentences}")
        # Predicted sentence units (final results only).
        predict_sentences_raw = self._get_predict_final_sentences(eval_result["predict_data"])
        # print("predict_sentences_raw", predict_sentences_raw)
        # Normalise predicted sentence units.
        predict_sentences = self._sentence_list_norm(predict_sentences_raw)
        # [self._sentence_norm(sentence) for sentence in predict_sentences_raw]
        # print("predict_sentences", predict_sentences)
        if log_mid_result:
            logger.info(f"predict_sentences {predict_sentences}")
        # Word-level alignment via minimum edit distance.
        label_tokens, predict_tokens = self._sentence_transfer(" ".join(label_sentences), " ".join(predict_sentences))
        # print(label_tokens)
        # print(predict_tokens)
        # WER from the aligned token lists.
        cer_info = self.cer(label_tokens, predict_tokens)
        # print("cer_info", cer_info)
        if log_mid_result:
            logger.info(f"cer_info {cer_info}")
        self.one_minus_cer += cer_info["one_minus_cer"]
        self.token_count += cer_info["token_count"]
        # Sentence-cut precision/recall.
        cut_info = self.cut_rate(label_sentences, predict_sentences, label_tokens, predict_tokens)
        # print(cut_info["miss_count"], cut_info["more_count"], cut_info["rate"])
        # print("cut_info", cut_info)
        if log_mid_result:
            logger.info(f"{cut_info['miss_count']}, {cut_info['more_count']}, {cut_info['rate']}")
        self.miss_count += cut_info["miss_count"]
        self.more_count += cut_info["more_count"]
        self.cut_count += cut_info["cut_count"]
        self.rate += cut_info["rate"]
        self.result.append(
            {
                "label_tokens": label_tokens,
                "predict_tokens": predict_tokens,
                "one_minus_cer": cer_info["one_minus_cer"],
                # NOTE(review): "token_count" is filled with one_minus_cer —
                # looks like a copy/paste slip; confirm intended value.
                "token_count": cer_info["one_minus_cer"],
                "miss_count": cut_info["miss_count"],
                "more_count": cut_info["more_count"],
                "cut_count": cut_info["cut_count"],
                "rate": cut_info["rate"],
            }
        )

    def cer(self, label_tokens, predict_tokens):
        """WER from aligned token lists: (S + D + I) / (S + D + C).

        None on the prediction side counts as a deletion, None on the label
        side as an insertion.
        NOTE(review): ZeroDivisionError if the lists are empty (s+d+c == 0).
        """
        s, d, i, c = 0, 0, 0, 0
        for label_token, predict_token in zip(label_tokens, predict_tokens):
            if label_token == predict_token:
                c += 1
            elif predict_token is None:
                d += 1
            elif label_token is None:
                i += 1
            else:
                s += 1
        cer = (s + d + i) / (s + d + c)
        one_minus_cer = max(1.0 - cer, 0)
        token_count = s + d + c
        return {"one_minus_cer": one_minus_cer, "token_count": token_count}

    def cut_rate(self, label_sentences, predict_sentences, label_tokens, predict_tokens):
        """Sentence-cut score using whitespace tokenisation.

        rate = max(1 - (miss + 2*more)/label_cuts, 0).
        NOTE(review): ZeroDivisionError if there are no labelled cut points.
        """
        label_final_index_list = set(self._sentence_final_index(label_sentences, label_tokens, "whitespace"))
        pred_final_index_list = set(self._sentence_final_index(predict_sentences, predict_tokens, "whitespace"))
        label_sentence_count = len(label_final_index_list)
        miss_count = len(label_final_index_list - pred_final_index_list)
        more_count = len(pred_final_index_list - label_final_index_list)
        rate = max(1 - (miss_count + more_count * 2) / label_sentence_count, 0)
        return {
            "miss_count": miss_count,
            "more_count": more_count,
            "cut_count": label_sentence_count,
            "rate": rate,
            "label_final_index_list": label_final_index_list,
            "pred_final_index_list": pred_final_index_list,
        }

    def _sentence_list_norm(self, sentence_list, tokenizer="whitespace"):
        """Normalise a sentence list by round-tripping through textnorm_en.py.

        Writes ./predict.txt, shells out to the speechio normaliser, and reads
        ./predict_norm.txt back.
        NOTE(review): uses cwd-relative temp files and shell=True — not safe
        for concurrent evaluators sharing a working directory; confirm usage
        is single-process.
        """
        pwd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        with open('./predict.txt', 'w', encoding='utf-8') as fp:
            for idx, sentence in enumerate(sentence_list):
                fp.write('%s\t%s\n' % (idx, sentence))
        subprocess.run(
            f'PYTHONPATH={pwd}/utils/speechio python {pwd}/utils/speechio/textnorm_en.py --has_key --to_upper ./predict.txt ./predict_norm.txt',
            shell=True,
            check=True,
        )
        sentence_norm = []
        with open('./predict_norm.txt', 'r', encoding='utf-8') as fp:
            for line in fp.readlines():
                line_split_result = line.strip().split('\t', 1)
                if len(line_split_result) >= 2:
                    sentence_norm.append(line_split_result[1])
        # A sentence may normalise to nothing and be dropped entirely.
        return sentence_norm

    def _sentence_transfer(self, label_sentence: str, predict_sentence: str, tokenizer="whitespace"):
        """Align label and prediction word-by-word via minimum edit distance.

        args:
            label: "HELLO WORLD ARE U OK YEP"
            predict: "HELLO WORLD U ARE U OK YEP"
            tokenizer: tokenisation mode
        return:
            label: ["HELLO", "WORLD", None, "ARE", "U", "OK", "YEP"]
            predict: ["HELLO", "WORLD", "U", "ARE", "U", "OK", "YEP"]
        """
        from utils.speechio import error_rate_zh as error_rate
        if tokenizer == "whitespace":
            alignment, score = error_rate.EditDistance(
                error_rate.tokenize_text(label_sentence, tokenizer),
                error_rate.tokenize_text(predict_sentence, tokenizer),
            )
            label_tokens, pred_tokens = [], []
            for align in alignment:
                label_tokens.append(align.ref)
                pred_tokens.append(align.hyp)
            return (label_tokens, pred_tokens)
        else:
            logger.error("tokenizer 出错了,暂时不支持其它的")

    def post_evaluate(self) -> Dict:
        pass

195
utils/evaluator.py Normal file
View File

@@ -0,0 +1,195 @@
# coding: utf-8
import os
from collections import Counter, defaultdict
from itertools import chain
from typing import List
from schemas.context import ASRContext
from utils.logger import logger
from utils.metrics import cer, cut_rate, cut_sentence, first_delay
from utils.metrics import mean_on_counter, patch_unique_token_count
from utils.metrics import revision_delay, text_align, token_mapping
from utils.metrics import var_on_counter
from utils.tokenizer import TOKENIZER_MAPPING, Tokenizer
from utils.update_submit import change_product_available
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", 1) is None
class BaseEvaluator:
def __init__(self) -> None:
self.query_count = 0 # query 数目(语音数目)
self.voice_count = 0
self.fail_count = 0 # 失败数目
# 首字延迟
self.first_delay_sum = 0
self.first_delay_cnt = 0
# 修正延迟
self.revision_delay_sum = 0
self.revision_delay_cnt = 0
# patch token 信息
self.patch_unique_cnt_counter = Counter()
# text align count
self.start_time_align_count = 0
self.end_time_align_count = 0
self.start_end_count = 0
# 1-cer
self.one_minus_cer = 0
self.token_count = 0
# 1-cer language
self.one_minus_cer_lang = defaultdict(int)
self.query_count_lang = defaultdict(int)
# sentence-cut
self.miss_count = 0
self.more_count = 0
self.sentence_count = 0
self.cut_rate = 0
# detail-case
self.context = ASRContext()
# 时延
self.send_interval = []
self.last_recv_interval = []
# 字含量不达标数
self.fail_char_contains_rate_num = 0
# 标点符号
self.punctuation_num = 0
self.pred_punctuation_num = 0
def evaluate(self, context: ASRContext):
self.query_count += 1
self.query_count_lang[context.lang] += 1
voice_count = len(context.labels)
self.voice_count += voice_count
self.punctuation_num += context.punctuation_num
self.pred_punctuation_num += context.pred_punctuation_num
if not context.fail:
# 首字延迟
first_delay_sum, first_delay_cnt = first_delay(context)
self.first_delay_sum += first_delay_sum
self.first_delay_cnt += first_delay_cnt
# 修正延迟
revision_delay_sum, revision_delay_cnt = revision_delay(context)
self.revision_delay_sum += revision_delay_sum
self.revision_delay_cnt += revision_delay_cnt
# patch token 信息
counter = patch_unique_token_count(context)
self.patch_unique_cnt_counter += counter
else:
self.fail_count += 1
self.fail_char_contains_rate_num += context.fail_char_contains_rate_num
# text align count
start_time_align_count, end_time_align_count, start_end_count = text_align(context)
self.start_time_align_count += start_time_align_count
self.end_time_align_count += end_time_align_count
self.start_end_count += start_end_count
# cer, wer
sentences_gt: List[str] = [item.answer for item in context.labels]
sentences_dt: List[str] = [
item.recognition_results.text for item in context.preds if item.recognition_results.final_result
]
if IN_TEST:
print(sentences_gt)
print(sentences_dt)
sentences_gt: List[str] = cut_sentence(sentences_gt, TOKENIZER_MAPPING.get(context.lang))
sentences_dt: List[str] = cut_sentence(sentences_dt, TOKENIZER_MAPPING.get(context.lang))
if IN_TEST:
print(sentences_gt)
print(sentences_dt)
# norm & tokenize
tokens_gt: List[List[str]] = Tokenizer.norm_and_tokenize(sentences_gt, context.lang)
tokens_dt: List[List[str]] = Tokenizer.norm_and_tokenize(sentences_dt, context.lang)
if IN_TEST:
print(tokens_gt)
print(tokens_dt)
# cer
tokens_gt_mapping, tokens_dt_mapping = token_mapping(list(chain(*tokens_gt)), list(chain(*tokens_dt)))
one_minue_cer, token_count = cer(tokens_gt_mapping, tokens_dt_mapping)
self.one_minus_cer += one_minue_cer
self.token_count += token_count
self.one_minus_cer_lang[context.lang] += one_minue_cer
# cut-rate
rate, sentence_cnt, miss_cnt, more_cnt = cut_rate(tokens_gt, tokens_dt, tokens_gt_mapping, tokens_dt_mapping)
self.cut_rate += rate
self.sentence_count += sentence_cnt
self.miss_count += miss_cnt
self.more_count += more_cnt
# detail-case
self.context = context
# 时延
if self.context.send_time_start_end and self.context.recv_time_start_end:
send_interval = self.context.send_time_start_end[1] - self.context.send_time_start_end[0]
recv_interval = self.context.recv_time_start_end[1] - self.context.send_time_start_end[0]
self.send_interval.append(send_interval)
self.last_recv_interval.append(recv_interval)
logger.info(
f"""第一次发送时间{self.context.send_time_start_end[0]}, \
最后一次发送时间{self.context.send_time_start_end[-1]}, \
发送间隔 {send_interval},
最后一次接收时间{self.context.recv_time_start_end[-1]}, \
接收间隔 {recv_interval}
"""
)
    def post_evaluate(self):
        """Hook called once after all queries have been evaluated; no final
        per-evaluator work is needed here, aggregation happens in gen_result."""
        pass
    def gen_result(self):
        """Aggregate the accumulated per-query counters into the final metrics dict.

        Side effect: calls change_product_available() (marks the product
        unavailable) when the mean first-token delay exceeds the
        FIRST_DELAY_THRESHOLD env var (default 5s) or more than 10% of voices
        contain garbled characters.

        NOTE(review): several divisions below (sentence_count, query_count,
        punctuation_num, voice_count, query_count_lang) are unguarded and will
        raise ZeroDivisionError when no data was evaluated — confirm callers
        guarantee non-empty input.
        """
        result = {
            "query_count": self.query_count,
            "voice_count": self.voice_count,
            "pred_voice_count": self.first_delay_cnt,
            # Delay means fall back to 10 (seconds) when nothing was predicted.
            "first_delay_mean": self.first_delay_sum / self.first_delay_cnt if self.first_delay_cnt > 0 else 10,
            "revision_delay_mean": (
                self.revision_delay_sum / self.revision_delay_cnt if self.revision_delay_cnt > 0 else 10
            ),
            "patch_token_mean": mean_on_counter(self.patch_unique_cnt_counter),
            "patch_token_var": var_on_counter(self.patch_unique_cnt_counter),
            "start_time_align_count": self.start_time_align_count,
            "end_time_align_count": self.end_time_align_count,
            "start_time_align_rate": self.start_time_align_count / self.sentence_count,
            "end_time_align_rate": self.end_time_align_count / self.sentence_count,
            "start_end_count": self.start_end_count,
            "one_minus_cer": self.one_minus_cer / self.query_count,
            "token_count": self.token_count,
            "miss_count": self.miss_count,
            "more_count": self.more_count,
            "sentence_count": self.sentence_count,
            "cut_rate": self.cut_rate / self.query_count,
            "fail_count": self.fail_count,
            "send_interval": self.send_interval,
            "last_recv_interval": self.last_recv_interval,
            "fail_char_contains_rate_num": self.fail_char_contains_rate_num,
            "punctuation_rate": self.pred_punctuation_num / self.punctuation_num,
        }
        # Per-language 1-CER breakdown, averaged over that language's query count.
        for lang in self.one_minus_cer_lang:
            result["one_minus_cer_" + lang] = \
                self.one_minus_cer_lang[lang] / self.query_count_lang[lang]
        if (
            result["first_delay_mean"]
            > float(os.getenv("FIRST_DELAY_THRESHOLD", "5"))
            or
            self.fail_char_contains_rate_num / self.voice_count > 0.1
            # or
            # result["punctuation_rate"] < 0.8
        ):
            change_product_available()
        return result
    def gen_detail_case(self):
        """Return the last evaluated context (stored by evaluate) as the detail case."""
        return self.context

293
utils/evaluator_plus.py Normal file
View File

@@ -0,0 +1,293 @@
from collections import defaultdict
from copy import deepcopy
from itertools import chain
from typing import Dict, List, Tuple
import Levenshtein
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel, StreamWordsModel
from utils.metrics import Tokenizer
from utils.metrics_plus import replace_general_punc
from utils.tokenizer import TOKENIZER_MAPPING
def evaluate_editops(
    query_data: QueryData, recognition_results: List[StreamDataModel]
) -> Tuple[float, int, Dict[int, int], Dict[int, int], float, float]:
    """Score one query's final streaming results against its labels.

    Returns a tuple of:
        cer: token-level error rate over the whole query,
        sentence count (len of label voices),
        align_start: {threshold_ms -> #sentences whose first word starts within it},
        align_end: {threshold_ms -> #sentences whose last word ends within it},
        sum of first-word absolute time distances,
        sum of last-word absolute time distances.
    """
    recognition_results = deepcopy(recognition_results)
    lang = query_data.lang
    voices = query_data.voice
    sentences_pred = [
        recognition_result.text for recognition_result in recognition_results
    ]
    sentences_label = [item.answer for item in voices]
    tokenizer_type = TOKENIZER_MAPPING[lang]
    sentences_pred = replace_general_punc(sentences_pred, tokenizer_type)
    sentences_label = replace_general_punc(sentences_label, tokenizer_type)
    # norm & tokenize
    tokens_pred = Tokenizer.norm_and_tokenize(sentences_pred, lang)
    tokens_label = Tokenizer.norm_and_tokenize(sentences_label, lang)
    normed_words = []
    for recognition_result in recognition_results:
        words = list(map(lambda x: x.text, recognition_result.words))
        normed_words.extend(words)
    normed_words = replace_general_punc(normed_words, tokenizer_type)
    normed_words = Tokenizer.norm(normed_words, lang)
    # Apply the same norm + tokenize pipeline to the per-word predictions so
    # their tokens line up with the sentence-level tokenization above.
    normed_word_index = 0
    for recognition_result in recognition_results:
        next_index = normed_word_index + len(recognition_result.words)
        tokens_words = Tokenizer.tokenize(
            normed_words[normed_word_index:next_index], lang
        )
        normed_word_index = next_index
        stream_words: List[StreamWordsModel] = []
        # After norm/tokenize, each produced token inherits the time span of
        # the raw word it came from.
        for raw_stream_word, tokens_word in zip(
            recognition_result.words, tokens_words
        ):
            for word in tokens_word:
                stream_words.append(
                    StreamWordsModel(
                        text=word,
                        start_time=raw_stream_word.start_time,
                        end_time=raw_stream_word.end_time,
                    )
                )
        recognition_result.words = stream_words
    # Match the timed words onto the tokenized sentence so every token gets a
    # start/end time (tokens with no matching word borrow the neighbor's time).
    pred_word_time: List[StreamWordsModel] = []
    for token_pred, recognition_result in zip(tokens_pred, recognition_results):
        word_index = 0
        for word in recognition_result.words:
            try:
                token_index = token_pred.index(word.text, word_index)
                for i in range(word_index, token_index + 1):
                    pred_word_time.append(
                        StreamWordsModel(
                            text=token_pred[i],
                            start_time=word.start_time,
                            end_time=word.end_time,
                        )
                    )
                word_index = token_index + 1
            except ValueError:
                # Token not found past word_index — skip this word.
                pass
        # Remaining tokens take the last word's (or the sentence's) time span.
        if len(recognition_result.words) > 0:
            word = recognition_result.words[-1]
            start_time = word.start_time
            end_time = word.end_time
        else:
            start_time = recognition_result.start_time
            end_time = recognition_result.end_time
        for i in range(word_index, len(token_pred)):
            pred_word_time.append(
                StreamWordsModel(
                    text=token_pred[i],
                    start_time=start_time,
                    end_time=end_time,
                )
            )
    # Record, per label sentence, the flat-list index of its first/last token.
    index = 0
    label_firstword_index: List[int] = []
    label_lastword_index: List[int] = []
    for token_label in tokens_label:
        label_firstword_index.append(index)
        index += len(token_label)
        label_lastword_index.append(index - 1)
    # cer
    flat_tokens_pred = list(chain(*tokens_pred))
    flat_tokens_label = list(chain(*tokens_label))
    ops = Levenshtein.editops(flat_tokens_pred, flat_tokens_label)
    insert = len(list(filter(lambda x: x[0] == "insert", ops)))
    delete = len(list(filter(lambda x: x[0] == "delete", ops)))
    replace = len(list(filter(lambda x: x[0] == "replace", ops)))
    cer = (insert + delete + replace) / len(flat_tokens_label)
    # Compute each token's index in the post-edit (aligned) sequence, so
    # label tokens can be located among prediction tokens.
    pred_offset = [0] * (len(flat_tokens_pred) + 1)
    label_offset = [0] * (len(flat_tokens_label) + 1)
    for op in ops:
        if op[0] == "insert":
            pred_offset[op[1]] += 1
        elif op[0] == "delete":
            label_offset[op[2]] += 1
    pred_indexs = [pred_offset[0]]
    for i in range(1, len(flat_tokens_pred)):
        pred_indexs.append(pred_indexs[i - 1] + pred_offset[i] + 1)
    label_indexs = [label_offset[0]]
    for i in range(1, len(flat_tokens_label)):
        label_indexs.append(label_indexs[i - 1] + label_offset[i] + 1)
    # For each label sentence, compare its first/last word's time against the
    # aligned predicted token's time, bucketed by ms thresholds.
    align_start = {100: 0, 200: 0, 300: 0, 500: 0}
    align_end = {100: 0, 200: 0, 300: 0, 500: 0}
    first_word_distance_sum = 0.0
    last_word_distance_sum = 0.0
    for firstword_index, lastword_index, voice in zip(
        label_firstword_index, label_lastword_index, voices
    ):
        label_index = label_indexs[firstword_index]
        label_in_pred_index = upper_bound(label_index, pred_indexs)
        if label_in_pred_index != -1:
            distance = abs(
                voice.start - pred_word_time[label_in_pred_index].start_time
            )
            # Also consider the previous token: take whichever is closer.
            if label_in_pred_index > 0:
                distance = min(
                    distance,
                    abs(
                        voice.start
                        - pred_word_time[label_in_pred_index - 1].start_time
                    ),
                )
        else:
            distance = abs(voice.start - pred_word_time[-1].start_time)
        for limit in align_start.keys():
            if distance <= limit / 1000:
                align_start[limit] += 1
        first_word_distance_sum += distance
        label_index = label_indexs[lastword_index]
        label_in_pred_index = lower_bound(label_index, pred_indexs)
        if label_in_pred_index != -1:
            distance = abs(
                voice.end - pred_word_time[label_in_pred_index].end_time
            )
            # Also consider the next token: take whichever is closer.
            if label_in_pred_index < len(pred_word_time) - 1:
                distance = min(
                    distance,
                    abs(
                        voice.end
                        - pred_word_time[label_in_pred_index + 1].end_time
                    ),
                )
        else:
            distance = abs(voice.end - pred_word_time[0].end_time)
        for limit in align_end.keys():
            if distance <= limit / 1000:
                align_end[limit] += 1
        last_word_distance_sum += distance
    return (
        cer,
        len(voices),
        align_start,
        align_end,
        first_word_distance_sum,
        last_word_distance_sum,
    )
def evaluate_punctuation(
    query_data: QueryData, recognition_results: List[StreamDataModel]
) -> Tuple[int, int, int, int]:
    """Evaluate punctuation metrics.

    Returns (predicted punctuation count, label punctuation count,
    predicted sentence-boundary punctuation count, label sentence count).

    NOTE(review): several punctuation literals below (zh/ja/el/ti entries)
    appear as empty strings in this view — they look garbled by text
    extraction; verify against the original file before relying on them.
    """
    # Per-language sentence punctuation sets; unknown languages fall back to
    # the Latin set via the defaultdict factory.
    punctuation_mapping = defaultdict(lambda: [",", ".", "!", "?"])
    punctuation_mapping.update(
        {
            "zh": ["", "", "", ""],
            "ja": ["", "", "", ""],
            "ar": ["،", ".", "!", "؟"],
            "fa": ["،", ".", "!", "؟"],
            "el": [",", ".", "", ""],
            "ti": [""],
            "th": [" ", ",", ".", "!", "?"],
        }
    )
    # Collect every predicted word that contains at least one punctuation char.
    punctuation_words: List[StreamWordsModel] = []
    for recognition_result in recognition_results:
        punctuations = punctuation_mapping[query_data.lang]
        for word in recognition_result.words:
            for char in word.text:
                if char in punctuations:
                    punctuation_words.append(word)
                    break
    punctuation_start_times = list(
        map(lambda x: x.start_time, punctuation_words)
    )
    punctuation_start_times = sorted(punctuation_start_times)
    punctuation_end_times = list(map(lambda x: x.end_time, punctuation_words))
    punctuation_end_times = sorted(punctuation_end_times)
    voices = query_data.voice
    label_len = len(voices)
    pred_punctuation_num = len(punctuation_words)
    # Count punctuation characters in the reference answers.
    label_punctuation_num = 0
    for label_voice in voices:
        punctuations = punctuation_mapping[query_data.lang]
        for char in label_voice.answer:
            if char in punctuations:
                label_punctuation_num += 1
    # For each label sentence boundary, check whether some predicted
    # punctuation word overlaps the gap between this sentence and the next
    # (last sentence uses a +/-0.7s window around its end).
    pred_sentence_punctuation_num = 0
    label_setence_punctuation_num = label_len
    for i, label_voice in enumerate(voices):
        if i < label_len - 1:
            label_left = label_voice.end
            label_right = voices[i + 1].start
        else:
            label_left = label_voice.end - 0.7
            label_right = label_voice.end + 0.7
        left_in_pred = upper_bound(label_left, punctuation_start_times)
        exist = False
        if (
            left_in_pred != -1
            and punctuation_start_times[left_in_pred] <= label_right
        ):
            exist = True
        right_in_pred = lower_bound(label_right, punctuation_end_times)
        if (
            right_in_pred != -1
            and punctuation_end_times[right_in_pred] >= label_left
        ):
            exist = True
        if exist:
            pred_sentence_punctuation_num += 1
    return (
        pred_punctuation_num,
        label_punctuation_num,
        pred_sentence_punctuation_num,
        label_setence_punctuation_num,
    )
def upper_bound(x: float, lst: List[float]) -> int:
    """Return the index of the first element >= x, or -1 when none exists."""
    lo, hi = 0, len(lst)
    while lo < hi:
        mid = (lo + hi) // 2
        if lst[mid] >= x:
            hi = mid
        else:
            lo = mid + 1
    return lo if lo < len(lst) else -1
def lower_bound(x: float, lst: List[float]) -> int:
    """Return the index of the last element <= x, or -1 when none exists."""
    lo, hi = 0, len(lst)
    while lo < hi:
        mid = (lo + hi) // 2
        if lst[mid] <= x:
            lo = mid + 1
        else:
            hi = mid
    # lo is the count of elements <= x, so lo - 1 is the last such index.
    return lo - 1

151
utils/file.py Normal file
View File

@@ -0,0 +1,151 @@
import json
import os
import shutil
import tarfile
import tempfile
import zipfile
from typing import Any
import yaml
def load_json(path: str, raise_for_invalid: bool = False) -> Any:
    """Parse the JSON file at ``path`` into a Python object.

    When ``raise_for_invalid`` is True, non-standard constants such as
    NaN/Infinity raise ``ValueError`` instead of being accepted.
    """
    with open(path, "r", encoding="utf-8") as fp:
        if not raise_for_invalid:
            return json.load(fp)

        def reject_constant(s: str):
            raise ValueError("非法json字符: %s" % s)

        return json.load(fp, parse_constant=reject_constant)
def dump_json(path: str, obj: Any):
    """Serialize ``obj`` as pretty-printed, UTF-8, non-ASCII-escaped JSON into ``path``."""
    text = json.dumps(obj, ensure_ascii=False, indent=4)
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(text)
def load_yaml(path: str) -> Any:
    """Parse the YAML file at ``path`` into a Python object."""
    with open(path, "r", encoding="utf-8") as fp:
        content = fp.read()
    return yaml.full_load(content)
def dump_yaml(path: str, obj: Any):
    """Write ``obj`` to ``path`` as YAML (unicode kept, key order preserved)."""
    text = yaml.dump(obj, indent=2, allow_unicode=True, sort_keys=False, line_break="\n")
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(text)
def dumps_yaml(obj: Any) -> str:
    """Render ``obj`` as a YAML string (unicode kept, key order preserved)."""
    options = dict(indent=2, allow_unicode=True, sort_keys=False, line_break="\n")
    return yaml.dump(obj, **options)
def read_file(path: str) -> str:
    """Return the entire text content of ``path``."""
    with open(path, "r") as fp:
        content = fp.read()
    return content
def write_bfile(path: str, data: bytes):
    """Write the raw bytes ``data`` to ``path``, replacing any existing content."""
    with open(path, mode="wb") as sink:
        sink.write(data)
def write_file(path: str, data: str):
    """Write the text ``data`` to ``path``, replacing any existing content."""
    with open(path, mode="w") as sink:
        sink.write(data)
def tail_file(path: str, tail: int) -> str:
    """Return the last ``tail`` lines of ``path``.

    Reads exponentially growing chunks from the end of the file until enough
    lines are found or the whole file has been consumed.
    """
    chunk = 1024
    with open(path, "rb") as fp:
        fp.seek(0, 2)
        size = fp.tell()
        while True:
            chunk = min(chunk, size)
            fp.seek(size - chunk, 0)
            raw_lines = fp.readlines()
            if len(raw_lines) > tail or size <= chunk:
                return "".join(raw.decode() for raw in raw_lines[-tail:])
            chunk *= 2
def zip_dir(zip_path: str, dirname: str):
    """Pack every file under ``dirname`` into the zip archive ``zip_path``,
    storing paths relative to ``dirname``."""
    with zipfile.ZipFile(zip_path, "w") as archive:
        for root, _, filenames in os.walk(dirname):
            rel_root = root.removeprefix(dirname)
            for filename in filenames:
                archive.write(
                    os.path.join(root, filename),
                    os.path.join(rel_root, filename),
                    zipfile.ZIP_DEFLATED,
                )
def zip_files(name: str, zipfile_paths: list):
    """Pack ``zipfile_paths`` = list[(archive name, source path)] into archive ``name``."""
    with zipfile.ZipFile(name, "w") as archive:
        for arc_name, src_path in zipfile_paths:
            archive.write(src_path, arc_name, zipfile.ZIP_DEFLATED)
def zip_strs(name: str, zipfile_strs: list):
    """Pack ``zipfile_strs`` = list[(filename, content)] into archive ``name``."""
    with zipfile.ZipFile(name, "w") as archive:
        for member_name, payload in zipfile_strs:
            archive.writestr(member_name, payload)
def zip_zipers(name: str, ziper_paths: list):
    """Build archive ``name`` from ``ziper_paths`` = list[(name inside archive,
    path to an archive/file/directory)].

    Each source is materialized into a temp directory first: zip archives are
    extracted, plain files copied, directories copied recursively; missing
    sources are silently skipped. The temp directory is zipped and removed.
    """
    temp_dirname = tempfile.mkdtemp(prefix=name, dir=os.path.dirname(name))
    os.makedirs(temp_dirname, exist_ok=True)
    for subname, ziper_path in ziper_paths:
        sub_dirname = os.path.join(temp_dirname, subname)
        if not os.path.exists(ziper_path):
            continue
        if zipfile.is_zipfile(ziper_path):
            # Archive: extract it into its own sub-directory.
            os.makedirs(sub_dirname, exist_ok=True)
            unzip_dir(ziper_path, sub_dirname)
        elif os.path.isfile(ziper_path):
            # Plain file: copied under the given sub-name.
            shutil.copyfile(ziper_path, sub_dirname)
        else:
            # Directory: copied recursively.
            shutil.copytree(ziper_path, sub_dirname)
    zip_dir(name, temp_dirname)
    shutil.rmtree(temp_dirname)
def unzip_dir(zip_path: str, dirname: str, catch_exc: bool = True):
    """Extract ``zip_path`` into ``dirname``.

    On extraction failure: when ``catch_exc`` is True, record the error in
    ``unzip_error.log`` inside ``dirname`` and copy the raw archive there;
    otherwise re-raise the exception.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        try:
            archive.extractall(dirname)
        except Exception as e:
            if not catch_exc:
                raise e
            write_file(os.path.join(dirname, "unzip_error.log"), "%r" % e)
            shutil.copyfile(zip_path, os.path.join(dirname, os.path.basename(zip_path)))
def tar_dir(zip_path: str, dirname: str):
    """Pack every file under ``dirname`` into gzip tarball ``zip_path``,
    storing paths relative to ``dirname``."""
    with tarfile.open(zip_path, "w:gz") as archive:
        for root, _, filenames in os.walk(dirname):
            rel_root = root.removeprefix(dirname)
            for filename in filenames:
                archive.add(os.path.join(root, filename), os.path.join(rel_root, filename))
def untar_dir(zip_path: str, dirname: str):
    """Extract the tarball ``zip_path`` into ``dirname``.

    NOTE(review): ``extractall`` without a ``filter`` trusts member paths, so
    a crafted tarball can write outside ``dirname`` (path traversal). If the
    archives are not fully trusted, use ``extractall(dirname, filter="data")``
    (Python >= 3.12, backported to recent 3.8+ patch releases).
    """
    with tarfile.open(zip_path) as ziper:
        ziper.extractall(dirname)

331
utils/helm.py Normal file
View File

@@ -0,0 +1,331 @@
# -*- coding: utf-8 -*-
import copy
import io
import json
import os
import re
import tarfile
import time
from collections import defaultdict
from typing import Any, Dict, Optional
import requests
from ruamel.yaml import YAML
from utils.logger import logger
# Root of the bundled SUT helm chart shipped next to this package.
sut_chart_root = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "helm-chart", "sut")
# Authorization header for leaderboard API calls (None when no token is configured).
headers = (
    {'Authorization': 'Bearer ' + os.getenv("LEADERBOARD_API_TOKEN")} if os.getenv("LEADERBOARD_API_TOKEN") else None
)
# Per-pod count of observed image-pull failures; unseen pods default to 0.
# Bug fix: ``defaultdict()`` without a factory raises KeyError on ``+= 1``.
pull_num: defaultdict = defaultdict(int)
JOB_ID = int(os.getenv("JOB_ID", "-1"))
LOAD_SUT_URL = os.getenv("LOAD_SUT_URL")
GET_JOB_SUT_INFO_URL = os.getenv("GET_JOB_SUT_INFO_URL")
def apply_env_to_values(values, envs):
    """Merge ``envs`` (name -> value) into the helm ``values['env']`` list.

    Existing entries (matched on first occurrence of the name) are updated in
    place; unknown names are appended. Returns the mutated ``values``.
    """
    env_list = values.setdefault("env", [])
    position = {}
    for i, entry in enumerate(env_list):
        position.setdefault(entry["name"], i)
    for name, value in envs.items():
        if name in position:
            env_list[position[name]]["value"] = value
        else:
            env_list.append({"name": name, "value": value})
    return values
def merge_values(base_value, incr_value):
    """Recursively merge ``incr_value`` into ``base_value``.

    Dicts merge key-by-key, lists concatenate, and any other combination is
    replaced by ``incr_value``. Returns the merged value (dicts/lists are
    mutated in place).
    """
    if isinstance(base_value, dict) and isinstance(incr_value, dict):
        for key, incr in incr_value.items():
            if key in base_value:
                base_value[key] = merge_values(base_value[key], incr)
            else:
                base_value[key] = incr
        return base_value
    if isinstance(base_value, list) and isinstance(incr_value, list):
        base_value.extend(incr_value)
        return base_value
    return incr_value
def gen_chart_tarball(docker_image):
    """Resolve ``docker_image`` (optionally to a digest) and build the helm chart tarball.

    Args:
        docker_image: docker image reference (``repo:tag``).

    Returns:
        tuple[BytesIO, dict]: (helm chart tarball file object, rendered values dict).

    Raises:
        AssertionError: when the digest-resolution API fails.
        RuntimeError: when the image reference is malformed.
    """
    # Load the values template shipped with the chart.
    with open(os.path.join(sut_chart_root, "values.yaml.tmpl")) as fp:
        yaml = YAML(typ="rt")
        values = yaml.load(fp)
    get_image_hash_url = os.getenv("GET_IMAGE_HASH_URL", None)
    logger.info(f"get_image_hash_url: {get_image_hash_url}")
    if get_image_hash_url is not None:
        # Convert the tag to a content digest so deployments are reproducible.
        # Bug fix: removed a debug leftover that overwrote ``docker_image``
        # with a hard-coded image, which made this function ignore its argument.
        resp = requests.get(get_image_hash_url, headers=headers, params={"image": docker_image}, timeout=600)
        logger.info(f"resp.text: {resp.text}")
        assert resp.status_code == 200, "Convert tag to hash for docker image failed, API retcode %d" % resp.status_code
        resp = resp.json()
        assert resp["success"], "Convert tag to hash for docker image failed, response: %s" % str(resp)
        # Digest form is ``repo:tag@sha256:...`` -> rsplit yields 3 parts.
        token = resp["data"]["image"].rsplit(":", 2)
        assert len(token) == 3, "Invalid docker image %s" % resp["data"]["image"]
        values["image"]["repository"] = token[0]
        values["image"]["tag"] = ":".join(token[1:])
    else:
        token = docker_image.rsplit(":", 1)
        if len(token) != 2:
            raise RuntimeError("Invalid docker image %s" % docker_image)
        values["image"]["repository"] = token[0]
        values["image"]["tag"] = token[1]
    # Render values.yaml next to the template so it is included in the tarball.
    with open(os.path.join(sut_chart_root, "values.yaml"), "w") as fp:
        yaml = YAML(typ="rt")
        yaml.dump(values, fp)
    # Package the whole chart directory as an in-memory gzip tarball.
    tarfp = io.BytesIO()
    with tarfile.open(fileobj=tarfp, mode="w:gz") as tar:
        tar.add(sut_chart_root, arcname=os.path.basename(sut_chart_root), recursive=True)
    tarfp.seek(0)
    logger.debug(f"Generated chart using values: {values}")
    return tarfp, values
def deploy_chart(
    name_suffix,
    readiness_timeout,
    chart_str=None,
    chart_fileobj=None,
    extra_values=None,
    restart_count_limit=3,
    pullimage_count_limit=3,
):
    """Deploy the SUT and block until it is ready; raises on failure.

    Args:
        name_suffix (str): distinguishes multiple SUTs within one job.
        readiness_timeout (int): readiness timeout in seconds.
        chart_str (str, optional): chart url; takes precedence over chart_fileobj.
        chart_fileobj (BytesIO, optional): helm chart tarball, used when chart_str is None.
        extra_values (dict, optional): extra helm values.
        restart_count_limit (int, optional): max SUT restarts before aborting.
        pullimage_count_limit (int, optional): max image-pull failures before aborting.

    Returns:
        tuple[str, str]: (k8s service name for reaching the SUT, name for unload_sut).
    """
    logger.info(f"Deploying SUT application for JOB {JOB_ID}, name_suffix {name_suffix}, extra_values {extra_values}")
    # deploy
    payload = {
        "job_id": JOB_ID,
        "resource_name": name_suffix,
        "priorityclassname": os.environ.get("priorityclassname"),
    }
    extra_values = {} if not extra_values else extra_values
    payload["values"] = json.dumps(extra_values, ensure_ascii=False)
    if chart_str is not None:
        payload["helm_chart"] = chart_str
        resp = requests.post(LOAD_SUT_URL, data=payload, headers=headers, timeout=600)
    else:
        assert chart_fileobj is not None, "Either chart_str or chart_fileobj should be set"
        logger.info(f"LOAD_SUT_URL: {LOAD_SUT_URL}")
        logger.info(f"payload: {payload}")
        logger.info(f"headers: {headers}")
        resp = requests.post(
            LOAD_SUT_URL,
            data=payload,
            headers=headers,
            files=[("helm_chart_file", (name_suffix + ".tgz", chart_fileobj))],
            timeout=600,
        )
    if resp.status_code != 200:
        raise RuntimeError("Failed to deploy application status_code %d %s" % (resp.status_code, resp.text))
    resp = resp.json()
    if not resp["success"]:
        # Bug fix: previously this only logged the failure and then fell
        # through to resp["data"], crashing with an opaque KeyError.
        logger.error("Failed to deploy application response %r", resp)
        raise RuntimeError("Failed to deploy application response %r" % resp)
    service_name = resp["data"]["service_name"]
    sut_name = resp["data"]["sut_name"]
    logger.info(f"SUT application deployed with service_name {service_name}")
    # waiting for appliation ready
    running_at = None
    retry_count = 0
    while True:
        retry_interval = 10
        # Emit a progress line every 20 polls to avoid flooding the log.
        if retry_count % 20 == 19:
            logger.info(f"Waiting {retry_interval} seconds to check whether SUT application {service_name} is ready...")
            logger.info("20 retrys log this message again.")
        # Bug fix: the counter was only incremented inside the ``if`` above,
        # so it stayed at 0 forever and the periodic log never fired.
        retry_count += 1
        time.sleep(retry_interval)
        check_result, running_at = check_sut_ready_from_resp(
            service_name,
            running_at,
            readiness_timeout,
            restart_count_limit,
            pullimage_count_limit,
        )
        if check_result:
            break
    logger.info(f"SUT application for JOB {JOB_ID} name_suffix {name_suffix} is ready, service_name {service_name}")
    return service_name, sut_name
def check_sut_ready_from_resp(
    service_name,
    running_at,
    readiness_timeout,
    restart_count_limit,
    pullimage_count_limit,
):
    """Poll the job's SUT status once and decide whether it is ready.

    Args:
        service_name: SUT service name (used in log/error messages).
        running_at: timestamp when all pods first reached Running, or None.
        readiness_timeout: max seconds a Running pod may stay not-Ready.
        restart_count_limit: max container restarts before aborting.
        pullimage_count_limit: max image-pull failures per pod before aborting.

    Returns:
        tuple[bool, float | None]: (ready?, possibly-updated running_at).

    Raises:
        RuntimeError: when pods terminated, restart/pull limits are exceeded,
            or readiness_timeout elapses.
    """
    try:
        resp = requests.get(
            f"{GET_JOB_SUT_INFO_URL}/{JOB_ID}",
            headers=headers,
            params={"with_detail": True},
            timeout=600,
        )
    except Exception as e:
        # Bug fix: the exception was passed as a stray positional arg to
        # logger.warning and never rendered; embed it in the message instead.
        logger.warning(f"Exception occured while getting SUT application {service_name} status: {e!r}")
        return False, running_at
    if resp.status_code != 200:
        logger.warning(f"Get SUT application {service_name} status failed with status_code {resp.status_code}")
        return False, running_at
    resp = resp.json()
    if not resp["success"]:
        logger.warning(f"Get SUT application {service_name} status failed with response {resp}")
        return False, running_at
    if len(resp["data"]["sut"]) == 0:
        logger.warning("Empty SUT application status")
        return False, running_at
    # Log a copy with the verbose "detail" field stripped, to keep it readable.
    resp_data_sut = copy.deepcopy(resp["data"]["sut"])
    for status in resp_data_sut:
        del status["detail"]
    logger.info(f"Got SUT application status: {resp_data_sut}")
    for status in resp["data"]["sut"]:
        if status["phase"] in ["Succeeded", "Failed"]:
            raise RuntimeError(f"Some pods of SUT application {service_name} terminated with status {status}")
        elif status["phase"] in ["Pending", "Unknown"]:
            return False, running_at
        elif status["phase"] != "Running":
            raise RuntimeError(f"Unexcepted pod status {status} of SUT application {service_name}")
        if running_at is None:
            running_at = time.time()
        for ct in status["detail"]["status"]["container_statuses"]:
            if ct["restart_count"] > 0:
                logger.info(f"pod {status['pod_name']} restart count = {ct['restart_count']}")
                if ct["restart_count"] > restart_count_limit:
                    raise RuntimeError(f"pod {status['pod_name']} restart too many times(over {restart_count_limit})")
            if (
                ct["state"]["waiting"] is not None
                and "reason" in ct["state"]["waiting"]
                and ct["state"]["waiting"]["reason"] in ["ImagePullBackOff", "ErrImagePull"]
            ):
                # Robust whether pull_num is a defaultdict or a plain dict.
                pull_num[status["pod_name"]] = pull_num.get(status["pod_name"], 0) + 1
                # Bug fix: the old message mixed f-string braces into a
                # %-format string, so the count printed as a literal
                # "{pull_num[...]}" placeholder.
                logger.info(
                    "pod %s has %d times inspect pulling image info: %s"
                    % (status["pod_name"], pull_num[status["pod_name"]], ct["state"]["waiting"])
                )
                if pull_num[status["pod_name"]] > pullimage_count_limit:
                    raise RuntimeError(f"pod {status['pod_name']} cannot pull image")
        if not status["conditions"]["Ready"]:
            if running_at is not None and time.time() - running_at > readiness_timeout:
                raise RuntimeError(f"SUT Application readiness has exceeded readiness_timeout:{readiness_timeout}s")
            return False, running_at
    return True, running_at
def parse_resource(resource):
    """Convert a k8s resource quantity string (e.g. "500m", "2Gi") to a float in base units.

    The sentinel -1 (meaning "no limit") is passed through unchanged.

    Raises:
        ValueError: when the string cannot be parsed or uses an unknown unit.
    """
    if resource == -1:
        return -1
    match = re.match(r"([\d\.]+)([mKMGTPENi]*)", resource)
    if match is None:
        # Bug fix: previously fell through to an opaque AttributeError on
        # ``match.groups()`` for unparseable input.
        raise ValueError(f"Cannot parse resource quantity: {resource!r}")
    value, unit = match.groups()
    value = float(value)
    # Decimal (K, M, ...) and binary (Ki, Mi, ...) suffixes; "m" is milli.
    unit_mapping = {
        "": 1,
        "m": 1e-3,
        "K": 1e3,
        "M": 1e6,
        "G": 1e9,
        "T": 1e12,
        "P": 1e15,
        "E": 1e18,
        "Ki": 2**10,
        "Mi": 2**20,
        "Gi": 2**30,
        "Ti": 2**40,
        "Pi": 2**50,
        "Ei": 2**60,
    }
    if unit not in unit_mapping:
        raise ValueError(f"Unknown resources unit: {unit}")
    return value * unit_mapping[unit]
def limit_resources(resource):
    """Clamp ``resource['limits']`` to at most 30 CPU cores and 100Gi memory.

    Mutates ``resource`` in place and returns it.
    """
    if "limits" not in resource:
        return resource
    if "cpu" in resource["limits"]:
        cpu_limit = parse_resource(resource["limits"]["cpu"])
        if cpu_limit > 30:
            logger.error("CPU limit exceeded. Adjusting to 30 cores.")
            resource["limits"]["cpu"] = "30"
    if "memory" in resource["limits"]:
        memory_limit = parse_resource(resource["limits"]["memory"])
        if memory_limit > 100 * 2**30:
            logger.error("Memory limit exceeded, adjusting to 100Gi")
            resource["limits"]["memory"] = "100Gi"
    # Bug fix: this path previously returned None while the early exit above
    # returned ``resource``; make the return value consistent for callers.
    return resource
def consistent_resources(resource):
    """Make ``requests`` mirror ``limits`` (or vice versa) so both keys are set.

    Limits win when both exist. A dict with neither key is returned untouched.
    Returns the mutated ``resource``.
    """
    has_limits = "limits" in resource
    has_requests = "requests" in resource
    if not has_limits and not has_requests:
        return resource
    if has_limits:
        resource["requests"] = resource["limits"]
    else:
        resource["limits"] = resource["requests"]
    return resource
def resource_check(values: Dict[str, Any]):
    """Validate and normalize GPU-related helm values.

    When a GPU is requested: pins vGPU memory/cores, mirrors limits into
    requests, forces the A100 vGPU node selector and adds the matching
    toleration. Returns the mutated ``values``.

    Raises:
        RuntimeError: when the GPU type is not A100-SXM4-80GBvgpu or the GPU
            count is not exactly 1.
    """
    resources = values.get("resources", {}).get("limits", {})
    if "nvidia.com/gpu" in resources and int(resources["nvidia.com/gpu"]) > 0:
        values["resources"]["limits"]["nvidia.com/gpumem"] = 8192
        values["resources"]["limits"]["nvidia.com/gpucores"] = 10
        values["resources"]["requests"] = values["resources"].get("requests", {})
        # Mirror cpu/memory limits into requests when not explicitly set.
        if "cpu" not in values["resources"]["requests"] and "cpu" in values["resources"]["limits"]:
            values["resources"]["requests"]["cpu"] = values["resources"]["limits"]["cpu"]
        if "memory" not in values["resources"]["requests"] and "memory" in values["resources"]["limits"]:
            values["resources"]["requests"]["memory"] = values["resources"]["limits"]["memory"]
        values["resources"]["requests"]["nvidia.com/gpu"] = values["resources"]["limits"]["nvidia.com/gpu"]
        values["resources"]["requests"]["nvidia.com/gpumem"] = 8192
        values["resources"]["requests"]["nvidia.com/gpucores"] = 10
        values["nodeSelector"] = values.get("nodeSelector", {})
        if "contest.4pd.io/accelerator" not in values["nodeSelector"]:
            values["nodeSelector"]["contest.4pd.io/accelerator"] = "A100-SXM4-80GBvgpu"
        gpu_type = values["nodeSelector"]["contest.4pd.io/accelerator"]
        # Bug fix: yaml may deliver the count as a string ("1"); compare as
        # int so a valid single-GPU request is not rejected.
        gpu_num = int(resources["nvidia.com/gpu"])
        if gpu_type != "A100-SXM4-80GBvgpu":
            raise RuntimeError("GPU类型只能为A100-SXM4-80GBvgpu")
        if gpu_num != 1:
            raise RuntimeError("GPU个数只能为1")
        values["tolerations"] = values.get("tolerations", [])
        values["tolerations"].append(
            {
                "key": "hosttype",
                "operator": "Equal",
                "value": "vgpu",
                "effect": "NoSchedule",
            }
        )
    return values

38
utils/leaderboard.py Normal file
View File

@@ -0,0 +1,38 @@
from utils.request import requests_retry_session
import os
import json
import traceback
from utils.logger import logger
# Base headers for leaderboard API calls; a bearer token is attached when the
# LEADERBOARD_API_TOKEN environment variable is configured.
lb_headers = {"Content-Type":"application/json"}
if os.getenv("LEADERBOARD_API_TOKEN"):
    lb_headers['Authorization'] = 'Bearer ' + os.getenv("LEADERBOARD_API_TOKEN")
def change_product_unavailable() -> None:
    """Report the current submit as product-unavailable to the leaderboard.

    Best effort: network failures are logged and swallowed.
    """
    logger.info("更改为产品不可用...")
    submit_id = str(os.getenv("SUBMIT_ID", -1))
    url = os.getenv("UPDATE_SUBMIT_URL", "http://contest.4pd.io:8080/submit/update")
    payload = json.dumps({submit_id: {"product_avaliable": 0}})
    try:
        requests_retry_session().post(url, data=payload, headers=lb_headers)
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(f"change product avaliable error, {e}")
def mark_evaluating(task_id) -> None:
    """Report EVALUATING status for ``task_id`` to the leaderboard.

    Best effort: network failures are logged and swallowed.
    """
    logger.info("上报EVALUATING状态...")
    job_id = os.getenv('JOB_ID') or "-1"
    url = os.getenv("REGISTER_MARK_TASK_URL", "http://contest.4pd.io:8080/job/register_mark_task") + "/" + job_id
    payload = json.dumps({"task_id": task_id})
    try:
        requests_retry_session().post(url, data=payload, headers=lb_headers)
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(f"mark evaluating error, {e}")

30
utils/logger.py Normal file
View File

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import logging
import os
# Configure the root logging format once at import time; the LOGLEVEL env var
# overrides the default INFO level.
logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
)
# Module-level logger shared across the project.
logger = logging.getLogger(__file__)
# A second, more detailed logger that prints file/line information and does
# not propagate to the root handler (avoids duplicate log lines).
log = logging.getLogger("detailed_logger")
log.propagate = False
level = logging.INFO
log.setLevel(level)
formatter = logging.Formatter(
    "[%(asctime)s] %(levelname)s : %(pathname)s:%(lineno)d - %(message)s",
    "%Y-%m-%d %H:%M:%S",
)
streamHandler = logging.StreamHandler()
streamHandler.setLevel(level)
streamHandler.setFormatter(formatter)
log.addHandler(streamHandler)

320
utils/metrics.py Normal file
View File

@@ -0,0 +1,320 @@
# coding: utf-8
import os
from collections import Counter
from copy import deepcopy
from typing import List, Tuple
import Levenshtein
import numpy as np
from schemas.context import ASRContext
from utils.logger import logger
from utils.tokenizer import Tokenizer, TokenizerType
from utils.update_submit import change_product_available
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
def text_align(context: ASRContext) -> Tuple:
    """Check sentence start/end time alignment between labels and predictions.

    Returns (start_time_align_count, end_time_align_count, start_end_count),
    where start_end_count counts ordering violations of the predicted
    start/end sequence.
    """
    start_end_count = 0
    label_start_time_list = []
    label_end_time_list = []
    for label_item in context.labels:
        label_start_time_list.append(label_item.start)
        label_end_time_list.append(label_item.end)
    # A new sentence starts right after a final result; collect each
    # sentence's first start_time and its final end_time.
    pred_start_time_list = []
    pred_end_time_list = []
    sentence_start = True
    for pred_item in context.preds:
        if sentence_start:
            pred_start_time_list.append(pred_item.recognition_results.start_time)
        if pred_item.recognition_results.final_result:
            pred_end_time_list.append(pred_item.recognition_results.end_time)
        sentence_start = pred_item.recognition_results.final_result
    # check start0 < end0 < start1 < end1 < start2 < end2 - ...
    if IN_TEST:
        print(pred_start_time_list)
        print(pred_end_time_list)
    # Interleave starts and ends into one monotone sequence to validate.
    pred_time_list = []
    i, j = 0, 0
    while i < len(pred_start_time_list) and j < len(pred_end_time_list):
        pred_time_list.append(pred_start_time_list[i])
        pred_time_list.append(pred_end_time_list[j])
        i += 1
        j += 1
    if i < len(pred_start_time_list):
        pred_time_list.append(pred_start_time_list[-1])
    for i in range(1, len(pred_time_list)):
        # Allow a 600ms grace window before flagging an ordering violation.
        if pred_time_list[i] < pred_time_list[i - 1] - 0.6:
            logger.error("识别的 start、end 不符合 start0 < end0 < start1 < end1 < start2 < end2 ...")
            logger.error(
                f"当前识别的每个句子开始和结束时间分别为: \
                开始时间:{pred_start_time_list}, \
                结束时间:{pred_end_time_list}"
            )
            start_end_count += 1
            # change_product_available()
    # Count label boundaries matched by some prediction within +/-300ms.
    start_time_align_count = 0
    end_time_align_count = 0
    for label_start_time in label_start_time_list:
        for pred_start_time in pred_start_time_list:
            if pred_start_time <= label_start_time + 0.3 and pred_start_time >= label_start_time - 0.3:
                start_time_align_count += 1
                break
    for label_end_time in label_end_time_list:
        for pred_end_time in pred_end_time_list:
            if pred_end_time <= label_end_time + 0.3 and pred_end_time >= label_end_time - 0.3:
                end_time_align_count += 1
                break
    logger.info(
        f"start-time 对齐个数 {start_time_align_count}, \
        end-time 对齐个数 {end_time_align_count}\
        数据集中句子总数 {len(label_start_time_list)}"
    )
    return start_time_align_count, end_time_align_count, start_end_count
def first_delay(context: ASRContext) -> Tuple:
    """Return (sum, count) of first-token delays, one entry per recognized sentence.

    Delay = time the first partial result for a sentence was received, minus
    the first audio send time, minus the sentence's audio start offset.
    """
    first_send_time = context.preds[0].send_time
    first_delay_list = []
    sentence_start = True
    for pred_context in context.preds:
        if sentence_start:
            sentence_begin_time = pred_context.recognition_results.start_time
            first_delay_time = pred_context.recv_time - first_send_time - sentence_begin_time
            first_delay_list.append(first_delay_time)
        # A new sentence begins after each final result.
        sentence_start = pred_context.recognition_results.final_result
    if IN_TEST:
        print(f"当前音频的首字延迟为{first_delay_list}")
    logger.info(f"当前音频的首字延迟均值为 {np.mean(first_delay_list)}s")
    return np.sum(first_delay_list), len(first_delay_list)
def revision_delay(context: ASRContext):
    """Return (sum, count) of revision delays, one entry per final result.

    Delay = time the final result was received, minus the first audio send
    time, minus the sentence's audio end offset.
    """
    first_send_time = context.preds[0].send_time
    revision_delay_list = []
    for pred_context in context.preds:
        if pred_context.recognition_results.final_result:
            sentence_end_time = pred_context.recognition_results.end_time
            revision_delay_time = pred_context.recv_time - first_send_time - sentence_end_time
            revision_delay_list.append(revision_delay_time)
    if IN_TEST:
        print(revision_delay_list)
    logger.info(f"当前音频的修正延迟均值为 {np.mean(revision_delay_list)}s")
    return np.sum(revision_delay_list), len(revision_delay_list)
def patch_unique_token_count(context: ASRContext):
    """Check the 3-second token-immutability rule and count patch sizes.

    Tokens whose emitting partial result is older than 3s must not change in
    later partials of the same sentence; violations are logged. Returns a
    Counter mapping distinct per-patch token counts to how often they occur.
    """
    # print(context.__dict__)
    # Tokenize every returned partial result.
    pred_text_list = [pred_context.recognition_results.text for pred_context in context.preds]
    pred_text_tokenized_list = Tokenizer.norm_and_tokenize(pred_text_list, lang=context.lang)
    # print(pred_text_list)
    # print(pred_text_tokenized_list)
    # Detect modification of tokens older than 3 seconds.
    ## Receive time of the current sentence's first partial.
    first_recv_time = None
    ## Number of tokens that may no longer be modified.
    unmodified_token_cnt = 0
    ## Index of the oldest partial still within the 3s window.
    time_token_idx = 0
    ## Whether the current partial starts a new sentence.
    final_sentence = True
    ## Whether an immutable token was modified.
    is_unmodified_token = False
    for idx, (now_tokens, pred_context) in enumerate(zip(pred_text_tokenized_list, context.preds)):
        ## First partial of a sentence: reset the window.
        if final_sentence:
            first_recv_time = pred_context.recv_time
            unmodified_token_cnt = 0
            time_token_idx = idx
            final_sentence = pred_context.recognition_results.final_result
            continue
        final_sentence = pred_context.recognition_results.final_result
        ## Receive time of the current partial.
        pred_recv_time = pred_context.recv_time
        ## The first 3 seconds of a sentence are exempt.
        if pred_recv_time - first_recv_time < 3:
            continue
        ## From history, derive the longest immutable prefix length.
        while time_token_idx < idx:
            context_pred_tmp = context.preds[time_token_idx]
            context_pred_tmp_recv_time = context_pred_tmp.recv_time
            tmp_tokens = pred_text_tokenized_list[time_token_idx]
            if pred_recv_time - context_pred_tmp_recv_time >= 3:
                unmodified_token_cnt = max(unmodified_token_cnt, len(tmp_tokens))
                time_token_idx += 1
            else:
                break
        ## Compare against the previous partial over the immutable prefix.
        last_tokens = pred_text_tokenized_list[idx - 1]
        if context.lang in ['ar', 'he']:
            tokens_check_pre, tokens_check_now = last_tokens[::-1], now_tokens[::-1]
            # NOTE(review): this ``continue`` skips the comparison entirely
            # for RTL languages, making the reversed assignment above dead
            # code — confirm whether ar/he are meant to be exempt or this is
            # a leftover bug.
            continue
        else:
            tokens_check_pre, tokens_check_now = last_tokens, now_tokens
        for token_a, token_b in zip(tokens_check_pre[:unmodified_token_cnt], tokens_check_now[:unmodified_token_cnt]):
            if token_a != token_b:
                is_unmodified_token = True
                break
        if is_unmodified_token and int(os.getenv('test', 0)):
            logger.error(
                f"{idx}-{unmodified_token_cnt}-{last_tokens[:unmodified_token_cnt]}-{now_tokens[:unmodified_token_cnt]}"
            )
        if is_unmodified_token:
            break
    if is_unmodified_token:
        logger.error("修改了不可修改的文字范围")
        # change_product_available()
    # In test mode, dump per-sentence (tokens, relative recv time) traces.
    if int(os.getenv('test', 0)):
        final_result = True
        result_list = []
        for tokens, pred in zip(pred_text_tokenized_list, context.preds):
            if final_result:
                result_list.append([])
            result_list[-1].append((tokens, pred.recv_time - context.preds[0].recv_time))
            final_result = pred.recognition_results.final_result
        for item in result_list:
            logger.info(str(item))
    # Count the distinct token counts seen within each sentence's patches.
    patch_unique_cnt_counter = Counter()
    patch_unique_cnt_in_one_sentence = set()
    for pred_text_tokenized, pred_context in zip(pred_text_tokenized_list, context.preds):
        token_cnt = len(pred_text_tokenized)
        patch_unique_cnt_in_one_sentence.add(token_cnt)
        if pred_context.recognition_results.final_result:
            for unique_cnt in patch_unique_cnt_in_one_sentence:
                patch_unique_cnt_counter[unique_cnt] += 1
            patch_unique_cnt_in_one_sentence.clear()
    # Flush the trailing, unfinished sentence (no final result yet).
    if context.preds and not context.preds[-1].recognition_results.final_result:
        for unique_cnt in patch_unique_cnt_in_one_sentence:
            patch_unique_cnt_counter[unique_cnt] += 1
    # print(patch_unique_cnt_counter)
    logger.info(
        f"当前音频的 patch token 均值为 {mean_on_counter(patch_unique_cnt_counter)}, \
        当前音频的 patch token 方差为 {var_on_counter(patch_unique_cnt_counter)}"
    )
    return patch_unique_cnt_counter
def mean_on_counter(counter: Counter) -> float:
    """Weighted mean of a Counter: sum(key * count) / sum(count).

    Fix: an empty counter previously raised ZeroDivisionError; it now
    returns 0.0, the natural value for "no observations".
    """
    total_count = sum(counter.values())
    if total_count == 0:
        return 0.0
    total_sum = sum(key * count for key, count in counter.items())
    return total_sum / total_count
def var_on_counter(counter: Counter) -> float:
    """Population variance of a Counter's keys weighted by their counts.

    Fix: an empty counter previously raised ZeroDivisionError; it now
    returns 0.0 (no observations -> no spread), mirroring mean_on_counter.
    """
    total_count = sum(counter.values())
    if total_count == 0:
        return 0.0
    mean = sum(key * count for key, count in counter.items()) / total_count
    return sum((key - mean) ** 2 * count for key, count in counter.items()) / total_count
def edit_distance(arr1: List, arr2: List):
    """Edit statistics between two sequences via Levenshtein editops.

    Returns a (substitutions, deletions, insertions, correct) tuple, where
    correct = len(arr1) - substitutions - deletions.
    """
    op_kinds = [op[0] for op in Levenshtein.editops(arr1, arr2)]
    s = op_kinds.count("replace")
    d = op_kinds.count("delete")
    i = op_kinds.count("insert")
    return s, d, i, len(arr1) - s - d
def cer(tokens_gt_mapping: List[str], tokens_dt_mapping: List[str]):
    """Given two None-padded (edit-distance aligned) token sequences, return (1 - cer, token_count)."""
    # None in the GT side marks an insertion; None in the DT side marks a deletion.
    insert = sum(1 for token in tokens_gt_mapping if token is None)
    delete = sum(1 for token in tokens_dt_mapping if token is None)
    equal = sum(1 for gt, dt in zip(tokens_gt_mapping, tokens_dt_mapping) if gt == dt)
    replace = len(tokens_gt_mapping) - insert - equal
    token_count = replace + equal + delete
    cer_value = (replace + delete + insert) / token_count
    logger.info(f"当前音频的 cer/wer 值为 {cer_value}, token 个数为 {token_count}")
    return 1 - cer_value, token_count
def cut_rate(
    tokens_gt: List[List[str]],
    tokens_dt: List[List[str]],
    tokens_gt_mapping: List[str],
    tokens_dt_mapping: List[str],
):
    """Score sentence-boundary agreement between GT and DT segmentations.

    Missed boundaries cost 1, spurious boundaries cost 2; the rate is
    clamped to [0, ...]. Returns (rate, gt_sentence_count, miss, more).
    """
    gt_ends = set(sentence_final_token_index(tokens_gt, tokens_gt_mapping))
    dt_ends = set(sentence_final_token_index(tokens_dt, tokens_dt_mapping))
    sentence_count_gt = len(gt_ends)
    miss_count = len(gt_ends - dt_ends)
    more_count = len(dt_ends - gt_ends)
    rate = max(1 - (miss_count + more_count * 2) / sentence_count_gt, 0)
    return rate, sentence_count_gt, miss_count, more_count
def token_mapping(tokens_gt: List[str], tokens_dt: List[str]) -> Tuple[List[str], List[str]]:
    """Align two token lists by padding None at insert/delete positions.

    Walks the Levenshtein editops in reverse so earlier indices stay valid
    while inserting placeholders.
    """
    aligned_gt = deepcopy(tokens_gt)
    aligned_dt = deepcopy(tokens_dt)
    for op_kind, pos_gt, pos_dt in reversed(Levenshtein.editops(aligned_gt, aligned_dt)):
        if op_kind == "insert":
            aligned_gt.insert(pos_gt, None)
        elif op_kind == "delete":
            aligned_dt.insert(pos_dt, None)
    return aligned_gt, aligned_dt
def sentence_final_token_index(tokens: List[List[str]], tokens_mapping: List[str]) -> List[int]:
    """For each sentence in `tokens`, return the index (into the None-padded
    `tokens_mapping`) of that sentence's final token, skipping None pads."""
    final_indexes = []
    cursor = 0
    for sentence_tokens in tokens:
        for _ in sentence_tokens:
            # Advance past insertion placeholders before consuming a real token.
            while cursor < len(tokens_mapping) and tokens_mapping[cursor] is None:
                cursor += 1
            cursor += 1
        final_indexes.append(cursor - 1)
    return final_indexes
def cut_sentence(sentences: List[str], tokenizerType: TokenizerType) -> List[str]:
    """Strip a fixed punctuation set from each sentence by repeated splitting,
    then re-join the remaining fragments (space-joined for whitespace
    tokenizers, concatenated otherwise). Returns one string per input sentence."""
    sentence_cut_list = []
    for sentence in sentences:
        sentence_list = [sentence]
        sentence_tmp_list = []
        # NOTE(review): several entries below render as empty strings "" — they
        # look like CJK punctuation (。,?!;: …) lost in an encoding pass.
        # str.split("") raises ValueError, so confirm these literals against
        # the original file before trusting this code path.
        for punc in [
            "······",
            "......",
            "",
            "",
            "",
            "",
            "",
            "",
            "...",
            ".",
            ",",
            "?",
            "!",
            ";",
            ":",
        ]:
            # Split every current fragment on this punctuation mark, then swap
            # buffers so the next mark operates on the refined fragments.
            for sentence in sentence_list:
                sentence_tmp_list.extend(sentence.split(punc))
            sentence_list, sentence_tmp_list = sentence_tmp_list, []
        # Drop empty fragments produced by adjacent punctuation.
        sentence_list = [item for item in sentence_list if item]
        if tokenizerType == TokenizerType.whitespace:
            sentence_cut_list.append(" ".join(sentence_list))
        else:
            sentence_cut_list.append("".join(sentence_list))
    return sentence_cut_list

50
utils/metrics_plus.py Normal file
View File

@@ -0,0 +1,50 @@
from typing import List
from utils.tokenizer import TokenizerType
def replace_general_punc(
    sentences: List[str], tokenizer: TokenizerType
) -> List[str]:
    """Replacement for utils.metrics.cut_sentence: maps every character of the
    punctuation set to a replacer via str.translate, then strips and lowercases.
    For whitespace tokenizers punctuation becomes a space; otherwise it is removed."""
    # NOTE(review): several entries render as empty strings "" — likely CJK
    # punctuation lost in an encoding pass (harmless to maketrans, but verify
    # against the original file that the intended characters are covered).
    general_puncs = [
        "······",
        "......",
        "",
        "",
        "",
        "",
        "",
        "",
        "...",
        ".",
        ",",
        "?",
        "!",
        ";",
        ":",
    ]
    if tokenizer == TokenizerType.whitespace:
        replacer = " "
    else:
        replacer = ""
    # Build one translation table mapping every individual punctuation char.
    trans = str.maketrans(dict.fromkeys("".join(general_puncs), replacer))
    ret_sentences = [""] * len(sentences)
    for i, sentence in enumerate(sentences):
        sentence = sentence.translate(trans)
        sentence = sentence.strip()
        sentence = sentence.lower()
        ret_sentences[i] = sentence
    return ret_sentences
def distance_point_line(
    point: float, line_start: float, line_end: float
) -> float:
    """Distance from a point to the closed interval [line_start, line_end];
    0 when the point lies inside it."""
    if point < line_start:
        return abs(point - line_start)
    if line_start <= point <= line_end:
        return 0
    return abs(point - line_end)

93
utils/pynini/Dockerfile Normal file
View File

@@ -0,0 +1,93 @@
# Dockerfile
# Pierre-André Noël, May 12th 2020
# Copyright © Element AI Inc. All rights reserved.
# Apache License, Version 2.0
#
# This builds `manylinux_2_28_x86_64` Python wheels for `pynini`, wrapping
# all its dependencies.
#
# This Dockerfile uses multi-stage builds; for more information, see:
# https://docs.docker.com/develop/develop-images/multistage-build/
#
# The recommended installation method for Pynini is through Conda-Forge. This gives Linux
# x86-64 users another option: installing a precompiled module from PyPI.
#
#
# To build wheels and run Pynini's tests, run:
#
# docker build --target=run-tests -t build-pynini-wheels .
#
# To extract the resulting wheels from the Docker image, run:
#
# docker run --rm -v `pwd`:/io build-pynini-wheels cp -r /wheelhouse /io
#
# Notice that this also generates Cython wheels.
#
# Then, `twine` (https://twine.readthedocs.io/en/latest/) can be used to
# publish the resulting Pynini wheels.
# ******************************************************
# *** All the following images are based on this one ***
# ******************************************************
#from quay.io/pypa/manylinux_2_28_x86_64 AS common
# ***********************************************************************
# *** Image providing all the requirements for building Pynini wheels ***
# ***********************************************************************
FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0
# The versions we want in the wheels.
ENV FST_VERSION="1.8.3"
ENV PYNINI_VERSION="2.1.6"
# Location of OpenFst and Pynini.
ENV FST_DOWNLOAD_PREFIX="https://www.openfst.org/twiki/pub/FST/FstDownload"
ENV PYNINI_DOWNLOAD_PREFIX="https://www.opengrm.org/twiki/pub/GRM/PyniniDownload"
# Note that our certificates are not known to the version of wget available in this image.
# Build toolchain. Fixes: use apt-get (stable CLI) instead of apt, skip
# recommended packages, and remove the apt lists in the same layer so the
# cache never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++-9 \
        gcc-9 \
        make \
        wget \
    && ln -s $(which gcc-9) /usr/bin/gcc \
    && ln -s $(which g++-9) /usr/bin/g++ \
    && rm -rf /var/lib/apt/lists/*
# Gets and unpacks OpenFst source.
RUN cd /tmp \
    && wget -q --no-check-certificate "${FST_DOWNLOAD_PREFIX}/openfst-${FST_VERSION}.tar.gz" \
    && tar -xzf "openfst-${FST_VERSION}.tar.gz" \
    && rm "openfst-${FST_VERSION}.tar.gz"
# Compiles OpenFst (with the grm extensions pynini links against).
RUN cd "/tmp/openfst-${FST_VERSION}" \
    && ./configure --enable-grm \
    && make --jobs 4 install \
    && rm -rd "/tmp/openfst-${FST_VERSION}"
# Gets and unpacks Pynini source.
RUN mkdir -p /src && cd /src \
    && wget -q --no-check-certificate "${PYNINI_DOWNLOAD_PREFIX}/pynini-${PYNINI_VERSION}.tar.gz" \
    && tar -xzf "pynini-${PYNINI_VERSION}.tar.gz" \
    && rm "pynini-${PYNINI_VERSION}.tar.gz"
# Installs pynini's build requirements. Fixes: drop the useless "|| exit"
# (RUN already fails the build on a non-zero status) and avoid persisting
# the pip cache in the layer.
RUN pip install --no-cache-dir -i https://nexus.4pd.io/repository/pypi-all/simple -r "/src/pynini-${PYNINI_VERSION}/requirements.txt"
# Compiles the wheels to a temporary directory.
RUN pip wheel -i https://nexus.4pd.io/repository/pypi-all/simple -v "/src/pynini-${PYNINI_VERSION}" -w /tmp/wheelhouse/
# patchelf is required by auditwheel to rewrite rpaths in the repaired wheels.
RUN wget ftp://ftp.4pd.io/pub/pico/temp/patchelf-0.18.0-x86_64.tar.gz \
    && tar xzf patchelf-0.18.0-x86_64.tar.gz \
    && rm -f patchelf-0.18.0-x86_64.tar.gz
RUN pip install --no-cache-dir -i https://nexus.4pd.io/repository/pypi-all/simple auditwheel
# Bundles external shared libraries into the wheels.
# See https://github.com/pypa/manylinux/tree/manylinux2014
RUN for WHL in /tmp/wheelhouse/pynini*.whl; do \
        PATH=$(pwd)/bin:$PATH auditwheel repair --plat manylinux_2_31_x86_64 "${WHL}" -w /wheelhouse/ || exit; \
    done
# Removes the non-repaired wheels.
RUN rm -rd /tmp/wheelhouse

17
utils/pynini/README.md Normal file
View File

@@ -0,0 +1,17 @@
# pynini
## 背景
SpeechIO对英文ASR的评估工具依赖第三方库pynini(https://github.com/kylebgorman/pynini)。该库强绑定OS和gcc版本,需要在运行环境中编译生成wheel包。本文说明编译pynini生成wheel包的方法。
## 编译
```shell
docker build -t build-pynini-wheels .
```
## 获取wheel包
```shell
docker run --rm -v `pwd`:/io build-pynini-wheels cp -r /wheelhouse /io
```

40
utils/request.py Normal file
View File

@@ -0,0 +1,40 @@
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
DEFAULT_TIMEOUT = 2 * 60 # seconds
class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that supplies a default timeout to every request lacking one."""

    def __init__(self, *args, **kwargs):
        # pop() replaces the in/del pair: take the caller's timeout if given,
        # otherwise fall back to the module-wide DEFAULT_TIMEOUT.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Only fill in the timeout when the caller left it unset (None).
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
def requests_retry_session(
    retries=3,
    backoff_factor=1,
    status_forcelist=(500, 502, 504, 404, 403),
    session=None,
):
    """Build (or augment) a requests.Session with retry + default-timeout adapters.

    Parameters:
        retries: total/read/connect retry budget passed to urllib3 Retry.
        backoff_factor: exponential backoff multiplier between retries.
        status_forcelist: HTTP statuses that trigger a retry. Fix: the default
            was a mutable list (shared mutable default-argument pitfall); it is
            now a tuple, which Retry accepts equally.
        session: optional existing session to mount the adapters onto.

    Returns:
        The session with TimeoutHTTPAdapter mounted for http:// and https://.
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = TimeoutHTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

65
utils/service.py Normal file
View File

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
import os
import sys
from utils.helm import deploy_chart, gen_chart_tarball
from utils.logger import logger
def register_sut(st_config, resource_name, **kwargs):
    """Deploy the SUT helm chart and return its websocket endpoint.

    Parameters:
        st_config: leaderboard config dict; only its "values" entry is used here.
        resource_name: name handed to deploy_chart for the k8s resources.
        **kwargs: accepted for interface compatibility; currently unused.

    Returns:
        "ws://<service>:<port>". The port comes from
        st_config["values"]["service"]["port"] when present, otherwise from
        the chart's own default values.

    Fixes: removed the duplicated docker_image assignment and the unused
    job_id local (both leftovers of the disabled config-driven image path).
    """
    # NOTE(review): image is intentionally hard-coded for now; config-driven
    # selection via st_config["docker_image"] is disabled upstream.
    docker_image = "10.255.143.18:5000/speaker_identification:wo_model_v0"
    st_config_values = st_config.get("values", {})
    chart_tar_fp, chart_values = gen_chart_tarball(docker_image)
    sut_service_name, _ = deploy_chart(
        resource_name,
        int(os.getenv("readiness_timeout", 60 * 3)),
        chart_fileobj=chart_tar_fp,
        extra_values=st_config_values,
        restart_count_limit=int(os.getenv('restart_count', 3)),
    )
    chart_tar_fp.close()
    if st_config_values is not None and "service" in st_config_values and "port" in st_config_values["service"]:
        sut_service_port = str(st_config_values["service"]["port"])
    else:
        sut_service_port = str(chart_values["service"]["port"])
    return "ws://{}:{}".format(sut_service_name, sut_service_port)
"""
elif "chart_repo" in st_config:
logger.info(f"正在使用 helm-chart 配置,内容为 {st_config}")
chart_repo = st_config.get("chart_repo", None)
chart_name = st_config.get("chart_name", None)
chart_version = st_config.get("chart_version", None)
if chart_repo is None or chart_name is None or chart_version is None:
logger.error("chart_repo, chart_name, chart_version cant be none")
logger.info(f"{chart_repo} {chart_name} {chart_version}")
chart_str = os.path.join(chart_repo, chart_name) + ':' + chart_version
st_cfg_values = st_config.get('values', {})
st_config["values"] = st_cfg_values
sut_service_name, _ = deploy_chart(
resource_name,
600,
chart_str=chart_str,
extra_values=st_cfg_values,
)
sut_service_name = f"asr-{job_id}"
if st_cfg_values is not None and 'service' in st_cfg_values and 'port' in st_cfg_values['service']:
sut_service_port = str(st_cfg_values['service']['port'])
else:
sut_service_port = '80'
return 'ws://%s:%s' % (sut_service_name, sut_service_port)
else:
logger.error("配置信息错误,缺少 docker_image 属性")
#sys.exit(-1)
"""

View File

@@ -0,0 +1,3 @@
'''
reference: https://github.com/SpeechColab/Leaderboard/tree/f287a992dc359d1c021bfc6ce810e5e36608e057/utils
'''

View File

@@ -0,0 +1,551 @@
#!/usr/bin/env python3
# coding=utf8
# Copyright 2022 Zhenxiang MA, Jiayu DU (SpeechColab)
import argparse
import csv
import json
import logging
import os
import sys
from typing import Iterable
logging.basicConfig(stream=sys.stderr, level=logging.ERROR, format='[%(levelname)s] %(message)s')
import pynini
from pynini.lib import pynutil
# reference: https://github.com/kylebgorman/pynini/blob/master/pynini/lib/edit_transducer.py
# to import original lib:
# from pynini.lib.edit_transducer import EditTransducer
class EditTransducer:
    """Factored edit-distance transducer over a token vocabulary (pynini FSTs).

    Built as two composable factors (left `_e_i`, right `_e_o`) so the lattice
    `iexpr @ _e_i @ _e_o @ oexpr` scores an alignment; based on pynini's
    edit_transducer.py, extended with 0-cost raw-form <-> 'TOKEN#' matching
    for GLM-expanded hypotheses.
    """

    # Auxiliary edit-operation symbols inserted into the factored machines.
    DELETE = "<delete>"
    INSERT = "<insert>"
    SUBSTITUTE = "<substitute>"
    def __init__(
        self,
        symbol_table,
        vocab: Iterable[str],
        insert_cost: float = 1.0,
        delete_cost: float = 1.0,
        substitute_cost: float = 1.0,
        bound: int = 0,
    ):
        # Left factor; note that we divide the edit costs by two because they also
        # will be incurred when traversing the right factor.
        sigma = pynini.union(
            *[pynini.accep(token, token_type=symbol_table) for token in vocab],
        ).optimize()
        insert = pynutil.insert(f"[{self.INSERT}]", weight=insert_cost / 2)
        delete = pynini.cross(sigma, pynini.accep(f"[{self.DELETE}]", weight=delete_cost / 2))
        substitute = pynini.cross(sigma, pynini.accep(f"[{self.SUBSTITUTE}]", weight=substitute_cost / 2))
        edit = pynini.union(insert, delete, substitute).optimize()
        if bound:
            # Bounded variant: at most `bound` edits, interleaved with matches.
            sigma_star = pynini.closure(sigma)
            self._e_i = sigma_star.copy()
            for _ in range(bound):
                self._e_i.concat(edit.ques).concat(sigma_star)
        else:
            # Unbounded variant: any number of edits or matches.
            self._e_i = edit.union(sigma).closure()
        self._e_i.optimize()
        right_factor_std = EditTransducer._right_factor(self._e_i)
        # right_factor_ext allows 0-cost matching between token's raw form & auxiliary form
        # e.g.: 'I' -> 'I#', 'AM' -> 'AM#'
        right_factor_ext = (
            pynini.union(
                *[
                    pynini.cross(
                        pynini.accep(x, token_type=symbol_table),
                        pynini.accep(x + '#', token_type=symbol_table),
                    )
                    for x in vocab
                ]
            )
            .optimize()
            .closure()
        )
        self._e_o = pynini.union(right_factor_std, right_factor_ext).closure().optimize()
    @staticmethod
    def _right_factor(ifst: pynini.Fst) -> pynini.Fst:
        """Derive the right factor: invert the left factor and swap INSERT/DELETE labels."""
        ofst = pynini.invert(ifst)
        syms = pynini.generated_symbols()
        insert_label = syms.find(EditTransducer.INSERT)
        delete_label = syms.find(EditTransducer.DELETE)
        pairs = [(insert_label, delete_label), (delete_label, insert_label)]
        right_factor = ofst.relabel_pairs(ipairs=pairs)
        return right_factor
    def create_lattice(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.Fst:
        """Compose input/output expressions with both factors into an alignment lattice."""
        lattice = (iexpr @ self._e_i) @ (self._e_o @ oexpr)
        EditTransducer.check_wellformed_lattice(lattice)
        return lattice
    @staticmethod
    def check_wellformed_lattice(lattice: pynini.Fst) -> None:
        # An empty lattice means the two expressions share no path at all.
        if lattice.start() == pynini.NO_STATE_ID:
            raise RuntimeError("Edit distance composition lattice is empty.")
    def compute_distance(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> float:
        """Edit distance = cost of the shortest path through the lattice."""
        lattice = self.create_lattice(iexpr, oexpr)
        # The shortest cost from all final states to the start state is
        # equivalent to the cost of the shortest path.
        start = lattice.start()
        return float(pynini.shortestdistance(lattice, reverse=True)[start])
    def compute_alignment(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.FstLike:
        """Return the single best alignment path as an optimized FST."""
        print(iexpr)
        print(oexpr)
        lattice = self.create_lattice(iexpr, oexpr)
        alignment = pynini.shortestpath(lattice, nshortest=1, unique=True)
        return alignment.optimize()
class ErrorStats:
    """Corpus-level accumulator for token/sentence error statistics, with
    JSON / Kaldi / human-readable report formats."""
    def __init__(self):
        self.num_ref_utts = 0
        self.num_hyp_utts = 0
        self.num_eval_utts = 0  # in both ref & hyp
        self.num_hyp_without_ref = 0
        # Token-level edit counts: Correct / Substitution / Insertion / Deletion.
        self.C = 0
        self.S = 0
        self.I = 0
        self.D = 0
        self.token_error_rate = 0.0
        # Normalized by max(ref_len, hyp_len) instead of ref_len alone.
        self.modified_token_error_rate = 0.0
        self.num_utts_with_error = 0
        self.sentence_error_rate = 0.0
    def to_json(self):
        """Compact single-line JSON dump of all counters and rates."""
        # return json.dumps(self.__dict__, indent=4)
        return json.dumps(self.__dict__)
    def to_kaldi(self):
        """Kaldi-style %WER / %SER summary lines."""
        info = (
            F'%WER {self.token_error_rate:.2f} [ {self.S + self.D + self.I} / {self.C + self.S + self.D}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
            F'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
        )
        return info
    def to_summary(self):
        """Human-readable multi-line report of the overall statistics."""
        summary = (
            '==================== Overall Statistics ====================\n'
            F'num_ref_utts: {self.num_ref_utts}\n'
            F'num_hyp_utts: {self.num_hyp_utts}\n'
            F'num_hyp_without_ref: {self.num_hyp_without_ref}\n'
            F'num_eval_utts: {self.num_eval_utts}\n'
            F'sentence_error_rate: {self.sentence_error_rate:.2f}%\n'
            F'token_error_rate: {self.token_error_rate:.2f}%\n'
            F'modified_token_error_rate: {self.modified_token_error_rate:.2f}%\n'
            F'token_stats:\n'
            F' - tokens:{self.C + self.S + self.D:>7}\n'
            F' - edits: {self.S + self.I + self.D:>7}\n'
            F' - cor: {self.C:>7}\n'
            F' - sub: {self.S:>7}\n'
            F' - ins: {self.I:>7}\n'
            F' - del: {self.D:>7}\n'
            '============================================================\n'
        )
        return summary
class Utterance:
    """Lightweight record pairing an utterance id with its transcript text."""

    def __init__(self, uid, text):
        self.uid = uid
        self.text = text
def LoadKaldiArc(filepath):
    """Load a Kaldi 'text'-style archive: one utterance per line, "uid [text]".

    Blank lines are skipped; a line with only a uid yields empty text.
    Returns a dict uid -> Utterance. Raises RuntimeError on a duplicate uid.

    Fix: replaced the `utts.get(uid) != None` comparison (PEP 8 anti-idiom,
    and an extra lookup) with a direct membership test — values are always
    Utterance objects, so the behavior is identical.
    """
    utts = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            cols = line.split(maxsplit=1)
            assert len(cols) == 2 or len(cols) == 1
            uid = cols[0]
            text = cols[1] if len(cols) == 2 else ''
            if uid in utts:
                raise RuntimeError(F'Found duplicated utterence id {uid}')
            utts[uid] = Utterance(uid, text)
    return utts
def BreakHyphen(token: str):
    """Expand a hyphenated token into its parts plus the hyphen-free join,
    e.g. 'T-SHIRT' -> ['T', 'SHIRT', 'TSHIRT']."""
    assert '-' in token
    parts = token.split('-')
    return parts + [token.replace('-', '')]
def LoadGLM(rel_path):
    '''Load a GLM (global language mapping) CSV into {rule_tag: [phrases]}.

    glm.csv:
        I'VE,I HAVE
        T-SHIRT,T SHIRT,TSHIRT
    becomes:
        {'<RULE_000000>': ["I'VE", 'I HAVE'],
         '<RULE_000001>': ['T-SHIRT', 'T SHIRT', 'TSHIRT']}

    The path is resolved relative to this module's directory.
    Fix: the file handle was opened inline and never closed; it is now
    managed with a `with` block.
    '''
    logging.info(f'Loading GLM from {rel_path} ...')
    abs_path = os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
    glm = {}
    with open(abs_path, encoding="utf-8") as fin:
        for k, rule in enumerate(csv.reader(fin, delimiter=',')):
            rule_name = f'<RULE_{k:06d}>'
            glm[rule_name] = [phrase.strip() for phrase in rule]
    logging.info(f' #rule: {len(glm)}')
    return glm
def SymbolEQ(symbol_table, i1, i2):
    """True when two symbol ids denote the same token once the auxiliary
    '#' markers (GLM alternative forms) are stripped."""
    sym1 = symbol_table.find(i1).strip('#')
    sym2 = symbol_table.find(i2).strip('#')
    return sym1 == sym2
def PrintSymbolTable(symbol_table: pynini.SymbolTable):
    """Dump every (id, symbol) pair, asserting the table's reverse lookup
    round-trips (find() is bidirectional: id <-> symbol)."""
    print('SYMBOL_TABLE:')
    for idx in range(symbol_table.num_symbols()):
        sym = symbol_table.find(idx)
        assert symbol_table.find(sym) == idx
        print(idx, sym)
    print()
def BuildSymbolTable(vocab) -> pynini.SymbolTable:
    """Create a SymbolTable with <epsilon> at id 0 followed by each vocab token."""
    logging.info('Building symbol table ...')
    table = pynini.SymbolTable()
    # Id 0 is conventionally reserved for epsilon.
    table.add_symbol('<epsilon>')
    for token in vocab:
        table.add_symbol(token)
    logging.info(f' #symbols: {table.num_symbols()}')
    # PrintSymbolTable(table)
    # table.write_text('symbol_table.txt')
    return table
def BuildGLMTagger(glm, symbol_table) -> pynini.Fst:
    """Build a context-dependent rewrite FST that wraps each GLM phrase
    occurrence in its rule tag: "... phrase ..." -> "... <RULE_x> phrase <RULE_x> ..."."""
    logging.info('Building GLM tagger ...')
    rule_taggers = []
    for rule_tag, rule in glm.items():
        for phrase in rule:
            # Insert the rule tag on both sides of the accepted phrase.
            rule_taggers.append(
                (
                    pynutil.insert(pynini.accep(rule_tag, token_type=symbol_table))
                    + pynini.accep(phrase, token_type=symbol_table)
                    + pynutil.insert(pynini.accep(rule_tag, token_type=symbol_table))
                )
            )
    # Closure over every non-epsilon symbol forms the rewrite context sigma*.
    alphabet = pynini.union(
        *[pynini.accep(sym, token_type=symbol_table) for k, sym in symbol_table if k != 0]  # non-epsilon
    ).optimize()
    tagger = pynini.cdrewrite(
        pynini.union(*rule_taggers).optimize(), '', '', alphabet.closure()
    ).optimize()  # could be slow with large vocabulary
    return tagger
def TokenWidth(token: str):
    """Display width of a token: CJK chars count as two columns, others as one."""
    return sum(2 if '\u4e00' <= c <= '\u9fa5' else 1 for c in token)
def PrintPrettyAlignment(raw_hyp, edit_ali, ref_ali, hyp_ali, stream=sys.stderr):
    """Print the raw hypothesis plus column-aligned HYP#/REF/EDIT rows for one
    utterance, padding by display width (CJK-aware via TokenWidth)."""
    assert len(edit_ali) == len(ref_ali) and len(ref_ali) == len(hyp_ali)
    hyp_row = ' HYP# : '
    ref_row = ' REF : '
    edit_row = ' EDIT : '
    for tag, ref_tok, hyp_tok in zip(edit_ali, ref_ali, hyp_ali):
        # Correct matches get a blank edit tag to reduce visual noise.
        tag = '' if tag == 'C' else tag
        col = max(TokenWidth(ref_tok), TokenWidth(hyp_tok), TokenWidth(tag)) + 1
        hyp_row += hyp_tok + ' ' * (col - TokenWidth(hyp_tok))
        ref_row += ref_tok + ' ' * (col - TokenWidth(ref_tok))
        edit_row += tag + ' ' * (col - TokenWidth(tag))
    print(F' HYP : {raw_hyp}', file=stream)
    print(hyp_row, file=stream)
    print(ref_row, file=stream)
    print(edit_row, file=stream)
def ComputeTokenErrorRate(c, s, i, d):
    """Return (TER, modified TER) percentages; the modified rate normalizes
    by max(ref_len, hyp_len) instead of ref_len alone."""
    assert (s + d + c) != 0
    num_edits = s + d + i
    ref_len = c + s + d
    hyp_len = c + s + i
    ter = 100.0 * num_edits / ref_len
    modified_ter = 100.0 * num_edits / max(ref_len, hyp_len)
    return ter, modified_ter
def ComputeSentenceErrorRate(num_err_utts, num_utts):
    """Percentage of evaluated utterances that contain at least one edit."""
    assert num_utts != 0
    return num_err_utts * 100.0 / num_utts
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--logk', type=int, default=500, help='logging interval')
parser.add_argument(
'--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER'
)
parser.add_argument('--glm', type=str, default='glm_en.csv', help='glm')
parser.add_argument('--ref', type=str, required=True, help='reference kaldi arc file')
parser.add_argument('--hyp', type=str, required=True, help='hypothesis kaldi arc file')
parser.add_argument('result_file', type=str)
args = parser.parse_args()
logging.info(args)
stats = ErrorStats()
logging.info('Generating tokenizer ...')
if args.tokenizer == 'whitespace':
def word_tokenizer(text):
return text.strip().split()
tokenizer = word_tokenizer
elif args.tokenizer == 'char':
def char_tokenizer(text):
return [c for c in text.strip().replace(' ', '')]
tokenizer = char_tokenizer
else:
tokenizer = None
assert tokenizer
logging.info('Loading REF & HYP ...')
ref_utts = LoadKaldiArc(args.ref)
hyp_utts = LoadKaldiArc(args.hyp)
# check valid utterances in hyp that have matched non-empty reference
uids = []
for uid in sorted(hyp_utts.keys()):
if uid in ref_utts.keys():
if ref_utts[uid].text.strip(): # non-empty reference
uids.append(uid)
else:
logging.warning(F'Found {uid} with empty reference, skipping...')
else:
logging.warning(F'Found {uid} without reference, skipping...')
stats.num_hyp_without_ref += 1
stats.num_hyp_utts = len(hyp_utts)
stats.num_ref_utts = len(ref_utts)
stats.num_eval_utts = len(uids)
logging.info(f' #hyp:{stats.num_hyp_utts}, #ref:{stats.num_ref_utts}, #utts_to_evaluate:{stats.num_eval_utts}')
print(f' #hyp:{stats.num_hyp_utts}, #ref:{stats.num_ref_utts}, #utts_to_evaluate:{stats.num_eval_utts}')
tokens = []
for uid in uids:
ref_tokens = tokenizer(ref_utts[uid].text)
hyp_tokens = tokenizer(hyp_utts[uid].text)
for t in ref_tokens + hyp_tokens:
tokens.append(t)
if '-' in t:
tokens.extend(BreakHyphen(t))
vocab_from_utts = list(set(tokens))
logging.info(f' HYP&REF vocab size: {len(vocab_from_utts)}')
print(f' HYP&REF vocab size: {len(vocab_from_utts)}')
assert args.glm
glm = LoadGLM(args.glm)
tokens = []
for rule in glm.values():
for phrase in rule:
for t in tokenizer(phrase):
tokens.append(t)
if '-' in t:
tokens.extend(BreakHyphen(t))
vocab_from_glm = list(set(tokens))
logging.info(f' GLM vocab size: {len(vocab_from_glm)}')
print(f' GLM vocab size: {len(vocab_from_glm)}')
vocab = list(set(vocab_from_utts + vocab_from_glm))
logging.info(f'Global vocab size: {len(vocab)}')
print(f'Global vocab size: {len(vocab)}')
symtab = BuildSymbolTable(
# Normal evaluation vocab + auxiliary form for alternative paths + GLM tags
vocab
+ [x + '#' for x in vocab]
+ [x for x in glm.keys()]
)
glm_tagger = BuildGLMTagger(glm, symtab)
edit_transducer = EditTransducer(symbol_table=symtab, vocab=vocab)
print(edit_transducer)
logging.info('Evaluating error rate ...')
print('Evaluating error rate ...')
fo = open(args.result_file, 'w+', encoding='utf8')
ndone = 0
for uid in uids:
ref = ref_utts[uid].text
raw_hyp = hyp_utts[uid].text
ref_fst = pynini.accep(' '.join(tokenizer(ref)), token_type=symtab)
print(ref_fst)
# print(ref_fst.string(token_type = symtab))
raw_hyp_fst = pynini.accep(' '.join(tokenizer(raw_hyp)), token_type=symtab)
# print(raw_hyp_fst.string(token_type = symtab))
# Say, we have:
# RULE_001: "I'M" <-> "I AM"
# REF: HEY I AM HERE
# HYP: HEY I'M HERE
#
# We want to expand HYP with GLM rules(marked with auxiliary #)
# HYP#: HEY {I'M | I# AM#} HERE
# REF is honored to keep its original form.
#
# This could be considered as a flexible on-the-fly TN towards HYP.
# 1. GLM rule tagging:
# HEY I'M HERE
# ->
# HEY <RULE_001> I'M <RULE_001> HERE
lattice = (raw_hyp_fst @ glm_tagger).optimize()
tagged_ir = pynini.shortestpath(lattice, nshortest=1, unique=True).string(token_type=symtab)
# print(hyp_tagged)
# 2. GLM rule expansion:
# HEY <RULE_001> I'M <RULE_001> HERE
# ->
# sausage-like fst: HEY {I'M | I# AM#} HERE
tokens = tagged_ir.split()
sausage = pynini.accep('', token_type=symtab)
i = 0
while i < len(tokens): # invariant: tokens[0, i) has been built into fst
forms = []
if tokens[i].startswith('<RULE_') and tokens[i].endswith('>'): # rule segment
rule_name = tokens[i]
rule = glm[rule_name]
# pre-condition: i -> ltag
raw_form = ''
for j in range(i + 1, len(tokens)):
if tokens[j] == rule_name:
raw_form = ' '.join(tokens[i + 1 : j])
break
assert raw_form
# post-condition: i -> ltag, j -> rtag
forms.append(raw_form)
for phrase in rule:
if phrase != raw_form:
forms.append(' '.join([x + '#' for x in phrase.split()]))
i = j + 1
else: # normal token segment
token = tokens[i]
forms.append(token)
if "-" in token: # token with hyphen yields extra forms
forms.append(' '.join([x + '#' for x in token.split('-')])) # 'T-SHIRT' -> 'T# SHIRT#'
forms.append(token.replace('-', '') + '#') # 'T-SHIRT' -> 'TSHIRT#'
i += 1
sausage_segment = pynini.union(*[pynini.accep(x, token_type=symtab) for x in forms]).optimize()
sausage += sausage_segment
hyp_fst = sausage.optimize()
print(hyp_fst)
# Utterance-Level error rate evaluation
alignment = edit_transducer.compute_alignment(ref_fst, hyp_fst)
print("alignment", alignment)
distance = 0.0
C, S, I, D = 0, 0, 0, 0 # Cor, Sub, Ins, Del
edit_ali, ref_ali, hyp_ali = [], [], []
for state in alignment.states():
for arc in alignment.arcs(state):
i, o = arc.ilabel, arc.olabel
if i != 0 and o != 0 and SymbolEQ(symtab, i, o):
e = 'C'
r, h = symtab.find(i), symtab.find(o)
C += 1
distance += 0.0
elif i != 0 and o != 0 and not SymbolEQ(symtab, i, o):
e = 'S'
r, h = symtab.find(i), symtab.find(o)
S += 1
distance += 1.0
elif i == 0 and o != 0:
e = 'I'
r, h = '*', symtab.find(o)
I += 1
distance += 1.0
elif i != 0 and o == 0:
e = 'D'
r, h = symtab.find(i), '*'
D += 1
distance += 1.0
else:
raise RuntimeError
edit_ali.append(e)
ref_ali.append(r)
hyp_ali.append(h)
# assert(distance == edit_transducer.compute_distance(ref_fst, sausage))
utt_ter, utt_mter = ComputeTokenErrorRate(C, S, I, D)
# print(F'{{"uid":{uid}, "score":{-distance}, "TER":{utt_ter:.2f}, "mTER":{utt_mter:.2f}, "cor":{C}, "sub":{S}, "ins":{I}, "del":{D}}}', file=fo)
# PrintPrettyAlignment(raw_hyp, edit_ali, ref_ali, hyp_ali, fo)
if utt_ter > 0:
stats.num_utts_with_error += 1
stats.C += C
stats.S += S
stats.I += I
stats.D += D
ndone += 1
if ndone % args.logk == 0:
logging.info(f'{ndone} utts evaluated.')
logging.info(f'{ndone} utts evaluated in total.')
# Corpus-Level evaluation
stats.token_error_rate, stats.modified_token_error_rate = ComputeTokenErrorRate(stats.C, stats.S, stats.I, stats.D)
stats.sentence_error_rate = ComputeSentenceErrorRate(stats.num_utts_with_error, stats.num_eval_utts)
print(stats.to_json(), file=fo)
# print(stats.to_kaldi())
# print(stats.to_summary(), file=fo)
fo.close()

View File

@@ -0,0 +1,370 @@
#!/usr/bin/env python3
# coding=utf8
# Copyright 2021 Jiayu DU
import sys
import argparse
import json
import logging
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='[%(levelname)s] %(message)s')
DEBUG = None
def GetEditType(ref_token, hyp_token):
    """Classify one alignment pair: 'I' insertion (ref gap), 'D' deletion
    (hyp gap), 'C' correct match, 'S' substitution.

    Fixes: `== None` / `!= None` comparisons replaced with identity tests
    (PEP 8), and the unreachable trailing `raise RuntimeError` removed —
    the C/S pair exhaustively covers every remaining case, including
    (None, None) which compares equal and yields 'C', as before.
    """
    if ref_token is None and hyp_token is not None:
        return 'I'
    if ref_token is not None and hyp_token is None:
        return 'D'
    if ref_token == hyp_token:
        return 'C'
    return 'S'
class AlignmentArc:
    """One arc on the best alignment path of the DP search grid.

    src/dst are (r, h) state coordinates along the REF/HYP axes; ref/hyp are
    the aligned tokens (None marks a gap); edit_type is derived via GetEditType.
    """
    def __init__(self, src, dst, ref, hyp):
        self.src = src
        self.dst = dst
        self.ref = ref
        self.hyp = hyp
        self.edit_type = GetEditType(ref, hyp)
def similarity_score_function(ref_token, hyp_token):
    """Diagonal-move score: 0 for a match, -1.0 for a substitution."""
    if ref_token == hyp_token:
        return 0
    return -1.0

def insertion_score_function(token):
    """Uniform insertion penalty (token ignored)."""
    return -1.0

def deletion_score_function(token):
    """Uniform deletion penalty (token ignored)."""
    return -1.0
def EditDistance(
    ref,
    hyp,
    similarity_score_function = similarity_score_function,
    insertion_score_function = insertion_score_function,
    deletion_score_function = deletion_score_function):
    """Levenshtein-style alignment via dynamic programming over a (R x H) grid.

    Returns (best_path, best_score): best_path is a list of AlignmentArc
    covering the optimal alignment (in forward order); best_score is its total
    score — 0 or negative with the default cost functions, higher is better.
    Ties between substitution/deletion/insertion moves resolve in favor of the
    later-checked move because `>=` overwrites. `ref` must be non-empty.
    """
    assert(len(ref) != 0)
    class DPState:
        # One cell of the DP grid: best score reaching (r, h) plus a backpointer.
        def __init__(self):
            self.score = -float('inf')
            # backpointer
            self.prev_r = None
            self.prev_h = None
    def print_search_grid(S, R, H, fstream):
        # Debug dump of the full grid (scores + backpointers).
        print(file=fstream)
        for r in range(R):
            for h in range(H):
                print(F'[{r},{h}]:{S[r][h].score:4.3f}:({S[r][h].prev_r},{S[r][h].prev_h}) ', end='', file=fstream)
            print(file=fstream)
    R = len(ref) + 1
    H = len(hyp) + 1
    # Construct DP search space, a (R x H) grid
    S = [ [] for r in range(R) ]
    for r in range(R):
        S[r] = [ DPState() for x in range(H) ]
    # initialize DP search grid origin, S(r = 0, h = 0)
    S[0][0].score = 0.0
    S[0][0].prev_r = None
    S[0][0].prev_h = None
    # initialize REF axis
    for r in range(1, R):
        S[r][0].score = S[r-1][0].score + deletion_score_function(ref[r-1])
        S[r][0].prev_r = r-1
        S[r][0].prev_h = 0
    # initialize HYP axis
    for h in range(1, H):
        S[0][h].score = S[0][h-1].score + insertion_score_function(hyp[h-1])
        S[0][h].prev_r = 0
        S[0][h].prev_h = h-1
    best_score = S[0][0].score
    best_state = (0, 0)
    for r in range(1, R):
        for h in range(1, H):
            # Candidate 1: substitution or correct (diagonal move).
            sub_or_cor_score = similarity_score_function(ref[r-1], hyp[h-1])
            new_score = S[r-1][h-1].score + sub_or_cor_score
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r-1
                S[r][h].prev_h = h-1
            # Candidate 2: deletion of a reference token (move along REF axis).
            del_score = deletion_score_function(ref[r-1])
            new_score = S[r-1][h].score + del_score
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r - 1
                S[r][h].prev_h = h
            # Candidate 3: insertion of a hypothesis token (move along HYP axis).
            ins_score = insertion_score_function(hyp[h-1])
            new_score = S[r][h-1].score + ins_score
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r
                S[r][h].prev_h = h-1
    # Full alignment always ends at the bottom-right corner of the grid.
    best_score = S[R-1][H-1].score
    best_state = (R-1, H-1)
    if DEBUG:
        print_search_grid(S, R, H, sys.stderr)
    # Backtracing best alignment path, i.e. a list of arcs
    #   arc = (src, dst, ref, hyp, edit_type)
    #   src/dst = (r, h), where r/h refers to search grid state-id along Ref/Hyp axis
    best_path = []
    r, h = best_state[0], best_state[1]
    prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
    score = S[r][h].score
    # loop invariant:
    #   1. (prev_r, prev_h) -> (r, h) is a "forward arc" on best alignment path
    #   2. score is the value of point(r, h) on DP search grid
    while prev_r != None or prev_h != None:
        src = (prev_r, prev_h)
        dst = (r, h)
        if (r == prev_r + 1 and h == prev_h + 1): # Substitution or correct
            arc = AlignmentArc(src, dst, ref[prev_r], hyp[prev_h])
        elif (r == prev_r + 1 and h == prev_h): # Deletion
            arc = AlignmentArc(src, dst, ref[prev_r], None)
        elif (r == prev_r and h == prev_h + 1): # Insertion
            arc = AlignmentArc(src, dst, None, hyp[prev_h])
        else:
            raise RuntimeError
        best_path.append(arc)
        r, h = prev_r, prev_h
        prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
        score = S[r][h].score
    best_path.reverse()
    return (best_path, best_score)
def PrettyPrintAlignment(alignment, stream = sys.stderr):
    """Print column-aligned REF/HYP/EDIT rows for an alignment path, padding
    by display width so double-width CJK characters stay aligned."""
    def token_text(token):
        # None marks an insertion/deletion gap; render it as '*'.
        return '*' if token is None else token
    def display_width(token_str):
        # Chinese chars occupy two terminal columns; everything else one.
        # TODO: support other double-width-char language such as Japanese, Korean
        return sum(2 if '\u4e00' <= ch <= '\u9fa5' else 1 for ch in token_str)
    ref_row = ' REF : '
    hyp_row = ' HYP : '
    edit_row = ' EDIT : '
    for arc in alignment:
        ref_tok = token_text(arc.ref)
        hyp_tok = token_text(arc.hyp)
        tag = arc.edit_type if arc.edit_type != 'C' else ''
        col = max(display_width(ref_tok), display_width(hyp_tok), display_width(tag)) + 1
        ref_row += ref_tok + ' ' * (col - display_width(ref_tok))
        hyp_row += hyp_tok + ' ' * (col - display_width(hyp_tok))
        edit_row += tag + ' ' * (col - display_width(tag))
    print(ref_row, file=stream)
    print(hyp_row, file=stream)
    print(edit_row, file=stream)
def CountEdits(alignment):
    """Tally edit operations along an alignment path.

    Returns a (correct, substitutions, insertions, deletions) tuple.
    Raises RuntimeError if an arc carries an unknown edit type.
    """
    counts = {'C': 0, 'S': 0, 'I': 0, 'D': 0}
    for arc in alignment:
        if arc.edit_type not in counts:
            raise RuntimeError
        counts[arc.edit_type] += 1
    return (counts['C'], counts['S'], counts['I'], counts['D'])
def ComputeTokenErrorRate(c, s, i, d):
    """Token error rate in percent: 100 * (S + D + I) / (S + D + C).

    The denominator (correct + substituted + deleted) equals the number
    of reference tokens.  Raises ZeroDivisionError for an empty reference.
    """
    num_edits = s + d + i
    num_ref_tokens = s + d + c
    return 100.0 * num_edits / num_ref_tokens
def ComputeSentenceErrorRate(num_err_utts, num_utts):
    """Percentage of evaluated utterances containing at least one edit."""
    # Caller must not pass zero evaluated utterances.
    assert num_utts != 0
    return 100.0 * num_err_utts / num_utts
class EvaluationResult:
    """Aggregated corpus-level ASR evaluation statistics (token and sentence error rates)."""
    def __init__(self):
        # Utterance bookkeeping
        self.num_ref_utts = 0
        self.num_hyp_utts = 0
        self.num_eval_utts = 0 # seen in both ref & hyp
        self.num_hyp_without_ref = 0
        # Token-level edit counts: Correct, Substitution, Insertion, Deletion
        self.C = 0
        self.S = 0
        self.I = 0
        self.D = 0
        # Corpus-level rates, in percent (filled in by the caller)
        self.token_error_rate = 0.0
        self.num_utts_with_error = 0
        self.sentence_error_rate = 0.0
    def to_json(self):
        # Serialize every statistic as one flat JSON object.
        return json.dumps(self.__dict__)
    def to_kaldi(self):
        # Kaldi-style "%WER ..." / "%SER ..." summary lines.
        info = (
            F'%WER {self.token_error_rate:.2f} [ {self.S + self.D + self.I} / {self.C + self.S + self.D}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
            F'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
        )
        return info
    def to_sclite(self):
        # Placeholder: sclite-compatible output not implemented yet.
        return "TODO"
    def to_espnet(self):
        # Placeholder: ESPnet-compatible output not implemented yet.
        return "TODO"
    def to_summary(self):
        """Human-readable multi-line summary of all statistics."""
        #return json.dumps(self.__dict__, indent=4)
        summary = (
            '==================== Overall Statistics ====================\n'
            F'num_ref_utts: {self.num_ref_utts}\n'
            F'num_hyp_utts: {self.num_hyp_utts}\n'
            F'num_hyp_without_ref: {self.num_hyp_without_ref}\n'
            F'num_eval_utts: {self.num_eval_utts}\n'
            F'sentence_error_rate: {self.sentence_error_rate:.2f}%\n'
            F'token_error_rate: {self.token_error_rate:.2f}%\n'
            F'token_stats:\n'
            F'  - tokens:{self.C + self.S + self.D:>7}\n'
            F'  - edits: {self.S + self.I + self.D:>7}\n'
            F'  - cor:   {self.C:>7}\n'
            F'  - sub:   {self.S:>7}\n'
            F'  - ins:   {self.I:>7}\n'
            F'  - del:   {self.D:>7}\n'
            '============================================================\n'
        )
        return summary
class Utterance:
    """One utterance: a unique identifier paired with its transcript text."""
    def __init__(self, uid, text):
        # text may be empty for utterances with no transcript.
        self.uid, self.text = uid, text
def LoadUtterances(filepath, format):
    """Load a transcript file into a {uid: Utterance} dict.

    Args:
        filepath: path to the transcript file (UTF-8).
        format: only 'text' is supported — each non-empty line is
            "utt_id word1 word2 ..." (the text part may be empty).

    Returns:
        dict mapping utterance id -> Utterance.

    Raises:
        RuntimeError: on an unsupported format or a duplicated utterance id.
    """
    utts = {}
    if format == 'text': # utt_id word1 word2 ...
        with open(filepath, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if line:
                    cols = line.split(maxsplit=1)
                    assert(len(cols) == 2 or len(cols) == 1)
                    uid = cols[0]
                    text = cols[1] if len(cols) == 2 else ''
                    # Idiomatic membership test (was: utts.get(uid) != None).
                    if uid in utts:
                        raise RuntimeError(F'Found duplicated utterance id {uid}')
                    utts[uid] = Utterance(uid, text)
    else:
        raise RuntimeError(F'Unsupported text format {format}')
    return utts
def tokenize_text(text, tokenizer):
    """Split text into evaluation tokens.

    'whitespace' yields whitespace-separated words (WER); 'char' yields
    individual characters with all whitespace removed (CER).  Any other
    tokenizer name raises RuntimeError.
    """
    if tokenizer == 'whitespace':
        return text.split()
    if tokenizer == 'char':
        return list(''.join(text.split()))
    raise RuntimeError(F'ERROR: Unsupported tokenizer {tokenizer}')
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # optional
    parser.add_argument('--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER')
    parser.add_argument('--ref-format', choices=['text'], default='text', help='reference format, first col is utt_id, the rest is text')
    parser.add_argument('--hyp-format', choices=['text'], default='text', help='hypothesis format, first col is utt_id, the rest is text')
    # required
    parser.add_argument('--ref', type=str, required=True, help='input reference file')
    parser.add_argument('--hyp', type=str, required=True, help='input hypothesis file')
    parser.add_argument('result_file', type=str)
    args = parser.parse_args()
    logging.info(args)
    ref_utts = LoadUtterances(args.ref, args.ref_format)
    hyp_utts = LoadUtterances(args.hyp, args.hyp_format)
    r = EvaluationResult()
    # Keep only hypothesis utterances that have a matched, non-empty reference.
    eval_utts = []
    r.num_hyp_without_ref = 0
    for uid in sorted(hyp_utts.keys()):
        if uid in ref_utts:  # O(1) dict membership (was: `in ref_utts.keys()`)
            if ref_utts[uid].text.strip(): # non-empty reference
                eval_utts.append(uid)
            else:
                # logging.warn is deprecated; use logging.warning
                logging.warning(F'Found {uid} with empty reference, skipping...')
        else:
            logging.warning(F'Found {uid} without reference, skipping...')
            r.num_hyp_without_ref += 1
    r.num_hyp_utts = len(hyp_utts)
    r.num_ref_utts = len(ref_utts)
    r.num_eval_utts = len(eval_utts)
    with open(args.result_file, 'w+', encoding='utf8') as fo:
        for uid in eval_utts:
            ref = ref_utts[uid]
            hyp = hyp_utts[uid]
            alignment, score = EditDistance(
                tokenize_text(ref.text, args.tokenizer),
                tokenize_text(hyp.text, args.tokenizer)
            )
            c, s, i, d = CountEdits(alignment)
            utt_ter = ComputeTokenErrorRate(c, s, i, d)
            # utt-level evaluation result; uid is quoted so the line is valid JSON
            print(F'{{"uid":"{uid}", "score":{score}, "ter":{utt_ter:.2f}, "cor":{c}, "sub":{s}, "ins":{i}, "del":{d}}}', file=fo)
            PrettyPrintAlignment(alignment, fo)
            r.C += c
            r.S += s
            r.I += i
            r.D += d
            if utt_ter > 0:
                r.num_utts_with_error += 1
        # corpus level evaluation result
        r.sentence_error_rate = ComputeSentenceErrorRate(r.num_utts_with_error, r.num_eval_utts)
        r.token_error_rate = ComputeTokenErrorRate(r.C, r.S, r.I, r.D)
        print(r.to_summary(), file=fo)
    print(r.to_json())
    print(r.to_kaldi())

744
utils/speechio/glm_en.csv Normal file
View File

@@ -0,0 +1,744 @@
I'M,I AM
I'LL,I WILL
I'D,I HAD
I'VE,I HAVE
I WOULD'VE,I'D HAVE
YOU'RE,YOU ARE
YOU'LL,YOU WILL
YOU'D,YOU WOULD
YOU'VE,YOU HAVE
HE'S,HE IS,HE WAS
HE'LL,HE WILL
HE'D,HE HAD
SHE'S,SHE IS,SHE WAS
SHE'LL,SHE WILL
SHE'D,SHE HAD
IT'S,IT IS,IT WAS
IT'LL,IT WILL
WE'RE,WE ARE,WE WERE
WE'LL,WE WILL
WE'D,WE WOULD
WE'VE,WE HAVE
WHO'LL,WHO WILL
THEY'RE,THEY ARE
THEY'LL,THEY WILL
THAT'S,THAT IS,THAT WAS
THAT'LL,THAT WILL
HERE'S,HERE IS,HERE WAS
THERE'S,THERE IS,THERE WAS
WHERE'S,WHERE IS,WHERE WAS
WHAT'S,WHAT IS,WHAT WAS
LET'S,LET US
WHO'S,WHO IS
ONE'S,ONE IS
THERE'LL,THERE WILL
SOMEBODY'S,SOMEBODY IS
EVERYBODY'S,EVERYBODY IS
WOULD'VE,WOULD HAVE
CAN'T,CANNOT,CAN NOT
HADN'T,HAD NOT
HASN'T,HAS NOT
HAVEN'T,HAVE NOT
ISN'T,IS NOT
AREN'T,ARE NOT
WON'T,WILL NOT
WOULDN'T,WOULD NOT
SHOULDN'T,SHOULD NOT
DON'T,DO NOT
DIDN'T,DID NOT
GOTTA,GOT TO
GONNA,GOING TO
WANNA,WANT TO
LEMME,LET ME
GIMME,GIVE ME
DUNNO,DON'T KNOW
GOTCHA,GOT YOU
KINDA,KIND OF
MYSELF,MY SELF
YOURSELF,YOUR SELF
HIMSELF,HIM SELF
HERSELF,HER SELF
ITSELF,IT SELF
OURSELVES,OUR SELVES
OKAY,OK,O K
Y'ALL,YALL,YOU ALL
'CAUSE,'COS,CUZ,BECAUSE
FUCKIN',FUCKING
KILLING,KILLIN'
EVERYDAY,EVERY DAY
DOCTOR,DR,DR.
MRS,MISSES,MISSUS
MR,MR.,MISTER
SR,SR.,SENIOR
JR,JR.,JUNIOR
ST,ST.,SAINT
VOL,VOL.,VOLUME
CM,CENTIMETER,CENTIMETRE
MM,MILLIMETER,MILLIMETRE
KM,KILOMETER,KILOMETRE
KB,KILOBYTES,KILO BYTES,K B
MB,MEGABYTES,MEGA BYTES
GB,GIGABYTES,GIGA BYTES,G B
THOUSAND,THOUSAND AND
HUNDRED,HUNDRED AND
A HUNDRED,ONE HUNDRED
TWO THOUSAND AND,TWENTY,TWO THOUSAND
STORYTELLER,STORY TELLER
TSHIRT,T SHIRT
TSHIRTS,T SHIRTS
LEUKAEMIA,LEUKEMIA
OESTROGEN,ESTROGEN
ACKNOWLEDGMENT,ACKNOWLEDGEMENT
JUDGMENT,JUDGEMENT
MAMMA,MAMA
DINING,DINNING
FLACK,FLAK
LEARNT,LEARNED
BLONDE,BLOND
JUMPSTART,JUMP START
RIGHTNOW,RIGHT NOW
EVERYONE,EVERY ONE
NAME'S,NAME IS
FAMILY'S,FAMILY IS
COMPANY'S,COMPANY HAS
GRANDKID,GRAND KID
GRANDKIDS,GRAND KIDS
MEALTIMES,MEAL TIMES
ALRIGHT,ALL RIGHT
GROWNUP,GROWN UP
GROWNUPS,GROWN UPS
SCHOOLDAYS,SCHOOL DAYS
SCHOOLCHILDREN,SCHOOL CHILDREN
CASEBOOK,CASE BOOK
HUNGOVER,HUNG OVER
HANDCLAPS,HAND CLAPS
HANDCLAP,HAND CLAP
HEATWAVE,HEAT WAVE
ADDON,ADD ON
ONTO,ON TO
INTO,IN TO
GOTO,GO TO
GUNSHOT,GUN SHOT
MOTHERFUCKER,MOTHER FUCKER
OFTENTIMES,OFTEN TIMES
SARTRE'S,SARTRE IS
NONSTARTER,NON STARTER
NONSTARTERS,NON STARTERS
LONGTIME,LONG TIME
POLICYMAKERS,POLICY MAKERS
ANYMORE,ANY MORE
CANADA'S,CANADA IS
CELLPHONE,CELL PHONE
WORKPLACE,WORK PLACE
UNDERESTIMATING,UNDER ESTIMATING
CYBERSECURITY,CYBER SECURITY
NORTHEAST,NORTH EAST
ANYTIME,ANY TIME
LIVESTREAM,LIVE STREAM
LIVESTREAMS,LIVE STREAMS
WEBCAM,WEB CAM
EMAIL,E MAIL
ECAM,E CAM
VMIX,V MIX
SETUP,SET UP
SMARTPHONE,SMART PHONE
MULTICASTING,MULTI CASTING
CHITCHAT,CHIT CHAT
SEMIFINAL,SEMI FINAL
SEMIFINALS,SEMI FINALS
BBQ,BARBECUE
STORYLINE,STORY LINE
STORYLINES,STORY LINES
BRO,BROTHER
BROS,BROTHERS
OVERPROTECTIIVE,OVER PROTECTIVE
TIMEOUT,TIME OUT
ADVISOR,ADVISER
TIMBERWOLVES,TIMBER WOLVES
WEBPAGE,WEB PAGE
NEWCOMER,NEW COMER
DELMAR,DEL MAR
NETPLAY,NET PLAY
STREETSIDE,STREET SIDE
COLOURED,COLORED
COLOURFUL,COLORFUL
O,ZERO
ETCETERA,ET CETERA
FUNDRAISING,FUND RAISING
RAINFOREST,RAIN FOREST
BREATHTAKING,BREATH TAKING
WIKIPAGE,WIKI PAGE
OVERTIME,OVER TIME
TRAIN'S,TRAIN IS
ANYONE,ANY ONE
PHYSIOTHERAPY,PHYSIO THERAPY
ANYBODY,ANY BODY
BOTTLECAPS,BOTTLE CAPS
BOTTLECAP,BOTTLE CAP
STEPFATHER'S,STEP FATHER'S
STEPFATHER,STEP FATHER
WARTIME,WAR TIME
SCREENSHOT,SCREEN SHOT
TIMELINE,TIME LINE
CITY'S,CITY IS
NONPROFIT,NON PROFIT
KPOP,K POP
HOMEBASE,HOME BASE
LIFELONG,LIFE LONG
LAWSUITS,LAW SUITS
MULTIBILLION,MULTI BILLION
ROADMAP,ROAD MAP
GUY'S,GUY IS
CHECKOUT,CHECK OUT
SQUARESPACE,SQUARE SPACE
REDLINING,RED LINING
BASE'S,BASE IS
TAKEAWAY,TAKE AWAY
CANDYLAND,CANDY LAND
ANTISOCIAL,ANTI SOCIAL
CASEWORK,CASE WORK
RIGOR,RIGOUR
ORGANIZATIONS,ORGANISATIONS
ORGANIZATION,ORGANISATION
SIGNPOST,SIGN POST
WWII,WORLD WAR TWO
WINDOWPANE,WINDOW PANE
SUREFIRE,SURE FIRE
MOUNTAINTOP,MOUNTAIN TOP
SALESPERSON,SALES PERSON
NETWORK,NET WORK
MINISERIES,MINI SERIES
EDWARDS'S,EDWARDS IS
INTERSUBJECTIVITY,INTER SUBJECTIVITY
LIBERALISM'S,LIBERALISM IS
TAGLINE,TAG LINE
SHINETHEORY,SHINE THEORY
CALLYOURGIRLFRIEND,CALL YOUR GIRLFRIEND
STARTUP,START UP
BREAKUP,BREAK UP
RADIOTOPIA,RADIO TOPIA
HEARTBREAKING,HEART BREAKING
AUTOIMMUNE,AUTO IMMUNE
SINISE'S,SINISE IS
KICKBACK,KICK BACK
FOGHORN,FOG HORN
BADASS,BAD ASS
POWERAMERICAFORWARD,POWER AMERICA FORWARD
GOOGLE'S,GOOGLE IS
ROLEPLAY,ROLE PLAY
PRICE'S,PRICE IS
STANDOFF,STAND OFF
FOREVER,FOR EVER
GENERAL'S,GENERAL IS
DOG'S,DOG IS
AUDIOBOOK,AUDIO BOOK
ANYWAY,ANY WAY
PIGEONHOLE,PIGEON HOLE
EGGSHELLS,EGG SHELLS
VACCINE'S,VACCINE IS
WORKOUT,WORK OUT
ADMINISTRATOR'S,ADMINISTRATOR IS
FUCKUP,FUCK UP
RUNOFFS,RUN OFFS
COLORWAY,COLOR WAY
WAITLIST,WAIT LIST
HEALTHCARE,HEALTH CARE
TEXTBOOK,TEXT BOOK
CALLBACK,CALL BACK
PARTYGOERS,PARTY GOERS
SOMEDAY,SOME DAY
NIGHTGOWN,NIGHT GOWN
STANDALONG,STAND ALONG
BUSSINESSWOMAN,BUSSINESS WOMAN
STORYTELLING,STORY TELLING
MARKETPLACE,MARKET PLACE
CRATEJOY,CRATE JOY
OUTPERFORMED,OUT PERFORMED
TRUEBOTANICALS,TRUE BOTANICALS
NONFICTION,NON FICTION
SPINOFF,SPIN OFF
MOTHERFUCKING,MOTHER FUCKING
TRACKLIST,TRACK LIST
GODDAMN,GOD DAMN
PORNHUB,PORN HUB
UNDERAGE,UNDER AGE
GOODBYE,GOOD BYE
HARDCORE,HARD CORE
TRUCK'S,TRUCK IS
COUNTERSTEERING,COUNTER STEERING
BUZZWORD,BUZZ WORD
SUBCOMPONENTS,SUB COMPONENTS
MOREOVER,MORE OVER
PICKUP,PICK UP
NEWSLETTER,NEWS LETTER
KEYWORD,KEY WORD
LOGIN,LOG IN
TOOLBOX,TOOL BOX
LINK'S,LINK IS
PRIMIALVIDEO,PRIMAL VIDEO
DOTNET,DOT NET
AIRSTRIKE,AIR STRIKE
HAIRSTYLE,HAIR STYLE
TOWNSFOLK,TOWNS FOLK
GOLDFISH,GOLD FISH
TOM'S,TOM IS
HOMETOWN,HOME TOWN
CORONAVIRUS,CORONA VIRUS
PLAYSTATION,PLAY STATION
TOMORROW,TO MORROW
TIMECONSUMING,TIME CONSUMING
POSTWAR,POST WAR
HANDSON,HANDS ON
SHAKEUP,SHAKE UP
ECOMERS,E COMERS
COFOUNDER,CO FOUNDER
HIGHEND,HIGH END
INPERSON,IN PERSON
GROWNUP,GROWN UP
SELFREGULATION,SELF REGULATION
INDEPTH,IN DEPTH
ALLTIME,ALL TIME
LONGTERM,LONG TERM
SOCALLED,SO CALLED
SELFCONFIDENCE,SELF CONFIDENCE
STANDUP,STAND UP
MINDBOGGLING,MIND BOGGLING
BEINGFOROTHERS,BEING FOR OTHERS
COWROTE,CO WROTE
COSTARRED,CO STARRED
EDITORINCHIEF,EDITOR IN CHIEF
HIGHSPEED,HIGH SPEED
DECISIONMAKING,DECISION MAKING
WELLBEING,WELL BEING
NONTRIVIAL,NON TRIVIAL
PREEXISTING,PRE EXISTING
STATEOWNED,STATE OWNED
PLUGIN,PLUG IN
PROVERSION,PRO VERSION
OPTIN,OPT IN
FOLLOWUP,FOLLOW UP
FOLLOWUPS,FOLLOW UPS
WIFI,WI FI
THIRDPARTY,THIRD PARTY
PROFESSIONALLOOKING,PROFESSIONAL LOOKING
FULLSCREEN,FULL SCREEN
BUILTIN,BUILT IN
MULTISTREAM,MULTI STREAM
LOWCOST,LOW COST
RESTREAM,RE STREAM
GAMECHANGER,GAME CHANGER
WELLDEVELOPED,WELL DEVELOPED
QUARTERINCH,QUARTER INCH
FASTFASHION,FAST FASHION
ECOMMERCE,E COMMERCE
PRIZEWINNING,PRIZE WINNING
NEVERENDING,NEVER ENDING
MINDBLOWING,MIND BLOWING
REALLIFE,REAL LIFE
REOPEN,RE OPEN
ONDEMAND,ON DEMAND
PROBLEMSOLVING,PROBLEM SOLVING
HEAVYHANDED,HEAVY HANDED
OPENENDED,OPEN ENDED
SELFCONTROL,SELF CONTROL
WELLMEANING,WELL MEANING
COHOST,CO HOST
RIGHTSBASED,RIGHTS BASED
HALFBROTHER,HALF BROTHER
FATHERINLAW,FATHER IN LAW
COAUTHOR,CO AUTHOR
REELECTION,RE ELECTION
SELFHELP,SELF HELP
PROLIFE,PRO LIFE
ANTIDUKE,ANTI DUKE
POSTSTRUCTURALIST,POST STRUCTURALIST
COFOUNDED,CO FOUNDED
XRAY,X RAY
ALLAROUND,ALL AROUND
HIGHTECH,HIGH TECH
TMOBILE,T MOBILE
INHOUSE,IN HOUSE
POSTMORTEM,POST MORTEM
LITTLEKNOWN,LITTLE KNOWN
FALSEPOSITIVE,FALSE POSITIVE
ANTIVAXXER,ANTI VAXXER
EMAILS,E MAILS
DRIVETHROUGH,DRIVE THROUGH
DAYTODAY,DAY TO DAY
COSTAR,CO STAR
EBAY,E BAY
KOOLAID,KOOL AID
ANTIDEMOCRATIC,ANTI DEMOCRATIC
MIDDLEAGED,MIDDLE AGED
SHORTLIVED,SHORT LIVED
BESTSELLING,BEST SELLING
TICTACS,TIC TACS
UHHUH,UH HUH
MULTITANK,MULTI TANK
JAWDROPPING,JAW DROPPING
LIVESTREAMING,LIVE STREAMING
HARDWORKING,HARD WORKING
BOTTOMDWELLING,BOTTOM DWELLING
PRESHOW,PRE SHOW
HANDSFREE,HANDS FREE
TRICKORTREATING,TRICK OR TREATING
PRERECORDED,PRE RECORDED
DOGOODERS,DO GOODERS
WIDERANGING,WIDE RANGING
LIFESAVING,LIFE SAVING
SKIREPORT,SKI REPORT
SNOWBASE,SNOW BASE
JAYZ,JAY Z
SPIDERMAN,SPIDER MAN
FREEKICK,FREE KICK
EDWARDSHELAIRE,EDWARDS HELAIRE
SHORTTERM,SHORT TERM
HAVENOTS,HAVE NOTS
SELFINTEREST,SELF INTEREST
SELFINTERESTED,SELF INTERESTED
SELFCOMPASSION,SELF COMPASSION
MACHINELEARNING,MACHINE LEARNING
COAUTHORED,CO AUTHORED
NONGOVERNMENT,NON GOVERNMENT
SUBSAHARAN,SUB SAHARAN
COCHAIR,CO CHAIR
LARGESCALE,LARGE SCALE
VIDEOONDEMAND,VIDEO ON DEMAND
FIRSTCLASS,FIRST CLASS
COFOUNDERS,CO FOUNDERS
COOP,CO OP
PREORDERS,PRE ORDERS
DOUBLEENTRY,DOUBLE ENTRY
SELFCONFIDENT,SELF CONFIDENT
SELFPORTRAIT,SELF PORTRAIT
NONWHITE,NON WHITE
ONBOARD,ON BOARD
HALFLIFE,HALF LIFE
ONCOURT,ON COURT
SCIFI,SCI FI
XMEN,X MEN
DAYLEWIS,DAY LEWIS
LALALAND,LA LA LAND
AWARDWINNING,AWARD WINNING
BOXOFFICE,BOX OFFICE
TRIDACTYLS,TRI DACTYLS
TRIDACTYL,TRI DACTYL
MEDIUMSIZED,MEDIUM SIZED
POSTSECONDARY,POST SECONDARY
FULLTIME,FULL TIME
GOKART,GO KART
OPENAIR,OPEN AIR
WELLKNOWN,WELL KNOWN
ICECREAM,ICE CREAM
EARTHMOON,EARTH MOON
STATEOFTHEART,STATE OF THE ART
BSIDE,B SIDE
EASTWEST,EAST WEST
ALLSTAR,ALL STAR
RUNNERUP,RUNNER UP
HORSEDRAWN,HORSE DRAWN
OPENSOURCE,OPEN SOURCE
PURPOSEBUILT,PURPOSE BUILT
SQUAREFREE,SQUARE FREE
PRESENTDAY,PRESENT DAY
CANADAUNITED,CANADA UNITED
HOTCHPOTCH,HOTCH POTCH
LOWLYING,LOW LYING
RIGHTHANDED,RIGHT HANDED
PEARSHAPED,PEAR SHAPED
BESTKNOWN,BEST KNOWN
FULLLENGTH,FULL LENGTH
YEARROUND,YEAR ROUND
PREELECTION,PRE ELECTION
RERECORD,RE RECORD
MINIALBUM,MINI ALBUM
LONGESTRUNNING,LONGEST RUNNING
ALLIRELAND,ALL IRELAND
NORTHWESTERN,NORTH WESTERN
PARTTIME,PART TIME
NONGOVERNMENTAL,NON GOVERNMENTAL
ONLINE,ON LINE
ONAIR,ON AIR
NORTHSOUTH,NORTH SOUTH
RERELEASED,RE RELEASED
LEFTHANDED,LEFT HANDED
BSIDES,B SIDES
ANGLOSAXON,ANGLO SAXON
SOUTHSOUTHEAST,SOUTH SOUTHEAST
CROSSCOUNTRY,CROSS COUNTRY
REBUILT,RE BUILT
FREEFORM,FREE FORM
SCOOBYDOO,SCOOBY DOO
ATLARGE,AT LARGE
COUNCILMANAGER,COUNCIL MANAGER
LONGRUNNING,LONG RUNNING
PREWAR,PRE WAR
REELECTED,RE ELECTED
HIGHSCHOOL,HIGH SCHOOL
RUNNERSUP,RUNNERS UP
NORTHWEST,NORTH WEST
WEBBASED,WEB BASED
HIGHQUALITY,HIGH QUALITY
RIGHTWING,RIGHT WING
LANEFOX,LANE FOX
PAYPERVIEW,PAY PER VIEW
COPRODUCTION,CO PRODUCTION
NONPARTISAN,NON PARTISAN
FIRSTPERSON,FIRST PERSON
WORLDRENOWNED,WORLD RENOWNED
VICEPRESIDENT,VICE PRESIDENT
PROROMAN,PRO ROMAN
COPRODUCED,CO PRODUCED
LOWPOWER,LOW POWER
SELFESTEEM,SELF ESTEEM
SEMITRANSPARENT,SEMI TRANSPARENT
SECONDINCOMMAND,SECOND IN COMMAND
HIGHRISE,HIGH RISE
COHOSTED,CO HOSTED
AFRICANAMERICAN,AFRICAN AMERICAN
SOUTHWEST,SOUTH WEST
WELLPRESERVED,WELL PRESERVED
FEATURELENGTH,FEATURE LENGTH
HIPHOP,HIP HOP
ALLBIG,ALL BIG
SOUTHEAST,SOUTH EAST
COUNTERATTACK,COUNTER ATTACK
QUARTERFINALS,QUARTER FINALS
STABLEDOOR,STABLE DOOR
DARKEYED,DARK EYED
ALLAMERICAN,ALL AMERICAN
THIRDPERSON,THIRD PERSON
LOWLEVEL,LOW LEVEL
NTERMINAL,N TERMINAL
DRIEDUP,DRIED UP
AFRICANAMERICANS,AFRICAN AMERICANS
ANTIAPARTHEID,ANTI APARTHEID
STOKEONTRENT,STOKE ON TRENT
NORTHNORTHEAST,NORTH NORTHEAST
BRANDNEW,BRAND NEW
RIGHTANGLED,RIGHT ANGLED
GOVERNMENTOWNED,GOVERNMENT OWNED
SONINLAW,SON IN LAW
SUBJECTOBJECTVERB,SUBJECT OBJECT VERB
LEFTARM,LEFT ARM
LONGLIVED,LONG LIVED
REDEYE,RED EYE
TPOSE,T POSE
NIGHTVISION,NIGHT VISION
SOUTHEASTERN,SOUTH EASTERN
WELLRECEIVED,WELL RECEIVED
ALFAYOUM,AL FAYOUM
TIMEBASED,TIME BASED
KETTLEDRUMS,KETTLE DRUMS
BRIGHTEYED,BRIGHT EYED
REDBROWN,RED BROWN
SAMESEX,SAME SEX
PORTDEPAIX,PORT DE PAIX
CLEANUP,CLEAN UP
PERCENT,PERCENT SIGN
TAKEOUT,TAKE OUT
KNOWHOW,KNOW HOW
FISHBONE,FISH BONE
FISHSTICKS,FISH STICKS
PAPERWORK,PAPER WORK
NICKNACKS,NICK NACKS
STREETTALKING,STREET TALKING
NONACADEMIC,NON ACADEMIC
SHELLY,SHELLEY
SHELLY'S,SHELLEY'S
JIMMY,JIMMIE
JIMMY'S,JIMMIE'S
DRUGSTORE,DRUG STORE
THRU,THROUGH
PLAYDATE,PLAY DATE
MICROLIFE,MICRO LIFE
SKILLSET,SKILL SET
SKILLSETS,SKILL SETS
TRADEOFF,TRADE OFF
TRADEOFFS,TRADE OFFS
ONSCREEN,ON SCREEN
PLAYBACK,PLAY BACK
ARTWORK,ART WORK
COWORKER,CO WORKER
COWORKERS,CO WORKERS
SOMETIME,SOME TIME
SOMETIMES,SOME TIMES
CROWDFUNDING,CROWD FUNDING
AM,A.M.,A M
PM,P.M.,P M
TV,T V
MBA,M B A
USA,U S A
US,U S
UK,U K
CEO,C E O
CFO,C F O
COO,C O O
CIO,C I O
FM,F M
GMC,G M C
FSC,F S C
NPD,N P D
APM,A P M
NGO,N G O
TD,T D
LOL,L O L
IPO,I P O
CNBC,C N B C
IPOS,I P OS
CNBC's,C N B C'S
JT,J T
NPR,N P R
NPR'S,N P R'S
MP,M P
IOI,I O I
DW,D W
CNN,C N N
WSM,W S M
ET,E T
IT,I T
RJ,R J
DVD,D V D
DVD'S,D V D'S
HBO,H B O
LA,L A
XC,X C
SUV,S U V
NBA,N B A
NBA'S,N B A'S
ESPN,E S P N
ESPN'S,E S P N'S
ADT,A D T
HD,H D
VIP,V I P
TMZ,T M Z
CBC,C B C
NPO,N P O
BBC,B B C
LA'S,L A'S
TMZ'S,T M Z'S
HIV,H I V
FTC,F T C
EU,E U
PHD,P H D
AI,A I
FHI,F H I
ICML,I C M L
ICLR,I C L R
BMW,B M W
EV,E V
CR,C R
API,A P I
ICO,I C O
LTE,L T E
OBS,O B S
PC,P C
IO,I O
CRM,C R M
RTMP,R T M P
ASMR,A S M R
GG,G G
WWW,W W W
PEI,P E I
JJ,J J
PT,P T
DJ,D J
SD,S D
POW,P.O.W.,P O W
FYI,F Y I
DC,D C,D.C
ABC,A B C
TJ,T J
WMDT,W M D T
WDTN,W D T N
TY,T Y
EJ,E J
CJ,C J
ACL,A C L
UK'S,U K'S
GTV,G T V
MDMA,M D M A
DFW,D F W
WTF,W T F
AJ,A J
MD,M D
PH,P H
ID,I D
SEO,S E O
UTM'S,U T M'S
EC,E C
UFC,U F C
RV,R V
UTM,U T M
CSV,C S V
SMS,S M S
GRB,G R B
GT,G T
LEM,L E M
XR,X R
EDU,E D U
NBC,N B C
EMS,E M S
CDC,C D C
MLK,M L K
IE,I E
OC,O C
HR,H R
MA,M A
DEE,D E E
AP,A P
UFO,U F O
DE,D E
LGBTQ,L G B T Q
PTA,P T A
NHS,N H S
CMA,C M A
MGM,M G M
AKA,A K A
HW,H W
GOP,G O P
GOP'S,G O P'S
FBI,F B I
PRX,P R X
CTO,C T O
URL,U R L
EIN,E I N
MLS,M L S
CSI,C S I
AOC,A O C
CND,C N D
CP,C P
PP,P P
CLI,C L I
PB,P B
FDA,F D A
MRNA,M R N A
PR,P R
VP,V P
DNC,D N C
MSNBC,M S N B C
GQ,G Q
UT,U T
XXI,X X I
HRV,H R V
WHO,W H O
CRO,C R O
DPA,D P A
PPE,P P E
EVA,E V A
BP,B P
GPS,G P S
AR,A R
PJ,P J
MLM,M L M
OLED,O L E D
BO,B O
VE,V E
UN,U N
SLS,S L S
DM,D M
DM'S,D M'S
ASAP,A S A P
ETA,E T A
DOB,D O B
BMW,B M W
1 I'M,I AM
2 I'LL,I WILL
3 I'D,I HAD
4 I'VE,I HAVE
5 I WOULD'VE,I'D HAVE
6 YOU'RE,YOU ARE
7 YOU'LL,YOU WILL
8 YOU'D,YOU WOULD
9 YOU'VE,YOU HAVE
10 HE'S,HE IS,HE WAS
11 HE'LL,HE WILL
12 HE'D,HE HAD
13 SHE'S,SHE IS,SHE WAS
14 SHE'LL,SHE WILL
15 SHE'D,SHE HAD
16 IT'S,IT IS,IT WAS
17 IT'LL,IT WILL
18 WE'RE,WE ARE,WE WERE
19 WE'LL,WE WILL
20 WE'D,WE WOULD
21 WE'VE,WE HAVE
22 WHO'LL,WHO WILL
23 THEY'RE,THEY ARE
24 THEY'LL,THEY WILL
25 THAT'S,THAT IS,THAT WAS
26 THAT'LL,THAT WILL
27 HERE'S,HERE IS,HERE WAS
28 THERE'S,THERE IS,THERE WAS
29 WHERE'S,WHERE IS,WHERE WAS
30 WHAT'S,WHAT IS,WHAT WAS
31 LET'S,LET US
32 WHO'S,WHO IS
33 ONE'S,ONE IS
34 THERE'LL,THERE WILL
35 SOMEBODY'S,SOMEBODY IS
36 EVERYBODY'S,EVERYBODY IS
37 WOULD'VE,WOULD HAVE
38 CAN'T,CANNOT,CAN NOT
39 HADN'T,HAD NOT
40 HASN'T,HAS NOT
41 HAVEN'T,HAVE NOT
42 ISN'T,IS NOT
43 AREN'T,ARE NOT
44 WON'T,WILL NOT
45 WOULDN'T,WOULD NOT
46 SHOULDN'T,SHOULD NOT
47 DON'T,DO NOT
48 DIDN'T,DID NOT
49 GOTTA,GOT TO
50 GONNA,GOING TO
51 WANNA,WANT TO
52 LEMME,LET ME
53 GIMME,GIVE ME
54 DUNNO,DON'T KNOW
55 GOTCHA,GOT YOU
56 KINDA,KIND OF
57 MYSELF,MY SELF
58 YOURSELF,YOUR SELF
59 HIMSELF,HIM SELF
60 HERSELF,HER SELF
61 ITSELF,IT SELF
62 OURSELVES,OUR SELVES
63 OKAY,OK,O K
64 Y'ALL,YALL,YOU ALL
65 'CAUSE,'COS,CUZ,BECAUSE
66 FUCKIN',FUCKING
67 KILLING,KILLIN'
68 EVERYDAY,EVERY DAY
69 DOCTOR,DR,DR.
70 MRS,MISSES,MISSUS
71 MR,MR.,MISTER
72 SR,SR.,SENIOR
73 JR,JR.,JUNIOR
74 ST,ST.,SAINT
75 VOL,VOL.,VOLUME
76 CM,CENTIMETER,CENTIMETRE
77 MM,MILLIMETER,MILLIMETRE
78 KM,KILOMETER,KILOMETRE
79 KB,KILOBYTES,KILO BYTES,K B
80 MB,MEGABYTES,MEGA BYTES
81 GB,GIGABYTES,GIGA BYTES,G B
82 THOUSAND,THOUSAND AND
83 HUNDRED,HUNDRED AND
84 A HUNDRED,ONE HUNDRED
85 TWO THOUSAND AND,TWENTY,TWO THOUSAND
86 STORYTELLER,STORY TELLER
87 TSHIRT,T SHIRT
88 TSHIRTS,T SHIRTS
89 LEUKAEMIA,LEUKEMIA
90 OESTROGEN,ESTROGEN
91 ACKNOWLEDGMENT,ACKNOWLEDGEMENT
92 JUDGMENT,JUDGEMENT
93 MAMMA,MAMA
94 DINING,DINNING
95 FLACK,FLAK
96 LEARNT,LEARNED
97 BLONDE,BLOND
98 JUMPSTART,JUMP START
99 RIGHTNOW,RIGHT NOW
100 EVERYONE,EVERY ONE
101 NAME'S,NAME IS
102 FAMILY'S,FAMILY IS
103 COMPANY'S,COMPANY HAS
104 GRANDKID,GRAND KID
105 GRANDKIDS,GRAND KIDS
106 MEALTIMES,MEAL TIMES
107 ALRIGHT,ALL RIGHT
108 GROWNUP,GROWN UP
109 GROWNUPS,GROWN UPS
110 SCHOOLDAYS,SCHOOL DAYS
111 SCHOOLCHILDREN,SCHOOL CHILDREN
112 CASEBOOK,CASE BOOK
113 HUNGOVER,HUNG OVER
114 HANDCLAPS,HAND CLAPS
115 HANDCLAP,HAND CLAP
116 HEATWAVE,HEAT WAVE
117 ADDON,ADD ON
118 ONTO,ON TO
119 INTO,IN TO
120 GOTO,GO TO
121 GUNSHOT,GUN SHOT
122 MOTHERFUCKER,MOTHER FUCKER
123 OFTENTIMES,OFTEN TIMES
124 SARTRE'S,SARTRE IS
125 NONSTARTER,NON STARTER
126 NONSTARTERS,NON STARTERS
127 LONGTIME,LONG TIME
128 POLICYMAKERS,POLICY MAKERS
129 ANYMORE,ANY MORE
130 CANADA'S,CANADA IS
131 CELLPHONE,CELL PHONE
132 WORKPLACE,WORK PLACE
133 UNDERESTIMATING,UNDER ESTIMATING
134 CYBERSECURITY,CYBER SECURITY
135 NORTHEAST,NORTH EAST
136 ANYTIME,ANY TIME
137 LIVESTREAM,LIVE STREAM
138 LIVESTREAMS,LIVE STREAMS
139 WEBCAM,WEB CAM
140 EMAIL,E MAIL
141 ECAM,E CAM
142 VMIX,V MIX
143 SETUP,SET UP
144 SMARTPHONE,SMART PHONE
145 MULTICASTING,MULTI CASTING
146 CHITCHAT,CHIT CHAT
147 SEMIFINAL,SEMI FINAL
148 SEMIFINALS,SEMI FINALS
149 BBQ,BARBECUE
150 STORYLINE,STORY LINE
151 STORYLINES,STORY LINES
152 BRO,BROTHER
153 BROS,BROTHERS
154 OVERPROTECTIIVE,OVER PROTECTIVE
155 TIMEOUT,TIME OUT
156 ADVISOR,ADVISER
157 TIMBERWOLVES,TIMBER WOLVES
158 WEBPAGE,WEB PAGE
159 NEWCOMER,NEW COMER
160 DELMAR,DEL MAR
161 NETPLAY,NET PLAY
162 STREETSIDE,STREET SIDE
163 COLOURED,COLORED
164 COLOURFUL,COLORFUL
165 O,ZERO
166 ETCETERA,ET CETERA
167 FUNDRAISING,FUND RAISING
168 RAINFOREST,RAIN FOREST
169 BREATHTAKING,BREATH TAKING
170 WIKIPAGE,WIKI PAGE
171 OVERTIME,OVER TIME
172 TRAIN'S,TRAIN IS
173 ANYONE,ANY ONE
174 PHYSIOTHERAPY,PHYSIO THERAPY
175 ANYBODY,ANY BODY
176 BOTTLECAPS,BOTTLE CAPS
177 BOTTLECAP,BOTTLE CAP
178 STEPFATHER'S,STEP FATHER'S
179 STEPFATHER,STEP FATHER
180 WARTIME,WAR TIME
181 SCREENSHOT,SCREEN SHOT
182 TIMELINE,TIME LINE
183 CITY'S,CITY IS
184 NONPROFIT,NON PROFIT
185 KPOP,K POP
186 HOMEBASE,HOME BASE
187 LIFELONG,LIFE LONG
188 LAWSUITS,LAW SUITS
189 MULTIBILLION,MULTI BILLION
190 ROADMAP,ROAD MAP
191 GUY'S,GUY IS
192 CHECKOUT,CHECK OUT
193 SQUARESPACE,SQUARE SPACE
194 REDLINING,RED LINING
195 BASE'S,BASE IS
196 TAKEAWAY,TAKE AWAY
197 CANDYLAND,CANDY LAND
198 ANTISOCIAL,ANTI SOCIAL
199 CASEWORK,CASE WORK
200 RIGOR,RIGOUR
201 ORGANIZATIONS,ORGANISATIONS
202 ORGANIZATION,ORGANISATION
203 SIGNPOST,SIGN POST
204 WWII,WORLD WAR TWO
205 WINDOWPANE,WINDOW PANE
206 SUREFIRE,SURE FIRE
207 MOUNTAINTOP,MOUNTAIN TOP
208 SALESPERSON,SALES PERSON
209 NETWORK,NET WORK
210 MINISERIES,MINI SERIES
211 EDWARDS'S,EDWARDS IS
212 INTERSUBJECTIVITY,INTER SUBJECTIVITY
213 LIBERALISM'S,LIBERALISM IS
214 TAGLINE,TAG LINE
215 SHINETHEORY,SHINE THEORY
216 CALLYOURGIRLFRIEND,CALL YOUR GIRLFRIEND
217 STARTUP,START UP
218 BREAKUP,BREAK UP
219 RADIOTOPIA,RADIO TOPIA
220 HEARTBREAKING,HEART BREAKING
221 AUTOIMMUNE,AUTO IMMUNE
222 SINISE'S,SINISE IS
223 KICKBACK,KICK BACK
224 FOGHORN,FOG HORN
225 BADASS,BAD ASS
226 POWERAMERICAFORWARD,POWER AMERICA FORWARD
227 GOOGLE'S,GOOGLE IS
228 ROLEPLAY,ROLE PLAY
229 PRICE'S,PRICE IS
230 STANDOFF,STAND OFF
231 FOREVER,FOR EVER
232 GENERAL'S,GENERAL IS
233 DOG'S,DOG IS
234 AUDIOBOOK,AUDIO BOOK
235 ANYWAY,ANY WAY
236 PIGEONHOLE,PIGEON HOLE
237 EGGSHELLS,EGG SHELLS
238 VACCINE'S,VACCINE IS
239 WORKOUT,WORK OUT
240 ADMINISTRATOR'S,ADMINISTRATOR IS
241 FUCKUP,FUCK UP
242 RUNOFFS,RUN OFFS
243 COLORWAY,COLOR WAY
244 WAITLIST,WAIT LIST
245 HEALTHCARE,HEALTH CARE
246 TEXTBOOK,TEXT BOOK
247 CALLBACK,CALL BACK
248 PARTYGOERS,PARTY GOERS
249 SOMEDAY,SOME DAY
250 NIGHTGOWN,NIGHT GOWN
251 STANDALONG,STAND ALONG
252 BUSSINESSWOMAN,BUSSINESS WOMAN
253 STORYTELLING,STORY TELLING
254 MARKETPLACE,MARKET PLACE
255 CRATEJOY,CRATE JOY
256 OUTPERFORMED,OUT PERFORMED
257 TRUEBOTANICALS,TRUE BOTANICALS
258 NONFICTION,NON FICTION
259 SPINOFF,SPIN OFF
260 MOTHERFUCKING,MOTHER FUCKING
261 TRACKLIST,TRACK LIST
262 GODDAMN,GOD DAMN
263 PORNHUB,PORN HUB
264 UNDERAGE,UNDER AGE
265 GOODBYE,GOOD BYE
266 HARDCORE,HARD CORE
267 TRUCK'S,TRUCK IS
268 COUNTERSTEERING,COUNTER STEERING
269 BUZZWORD,BUZZ WORD
270 SUBCOMPONENTS,SUB COMPONENTS
271 MOREOVER,MORE OVER
272 PICKUP,PICK UP
273 NEWSLETTER,NEWS LETTER
274 KEYWORD,KEY WORD
275 LOGIN,LOG IN
276 TOOLBOX,TOOL BOX
277 LINK'S,LINK IS
278 PRIMIALVIDEO,PRIMAL VIDEO
279 DOTNET,DOT NET
280 AIRSTRIKE,AIR STRIKE
281 HAIRSTYLE,HAIR STYLE
282 TOWNSFOLK,TOWNS FOLK
283 GOLDFISH,GOLD FISH
284 TOM'S,TOM IS
285 HOMETOWN,HOME TOWN
286 CORONAVIRUS,CORONA VIRUS
287 PLAYSTATION,PLAY STATION
288 TOMORROW,TO MORROW
289 TIMECONSUMING,TIME CONSUMING
290 POSTWAR,POST WAR
291 HANDSON,HANDS ON
292 SHAKEUP,SHAKE UP
293 ECOMERS,E COMERS
294 COFOUNDER,CO FOUNDER
295 HIGHEND,HIGH END
296 INPERSON,IN PERSON
297 GROWNUP,GROWN UP
298 SELFREGULATION,SELF REGULATION
299 INDEPTH,IN DEPTH
300 ALLTIME,ALL TIME
301 LONGTERM,LONG TERM
302 SOCALLED,SO CALLED
303 SELFCONFIDENCE,SELF CONFIDENCE
304 STANDUP,STAND UP
305 MINDBOGGLING,MIND BOGGLING
306 BEINGFOROTHERS,BEING FOR OTHERS
307 COWROTE,CO WROTE
308 COSTARRED,CO STARRED
309 EDITORINCHIEF,EDITOR IN CHIEF
310 HIGHSPEED,HIGH SPEED
311 DECISIONMAKING,DECISION MAKING
312 WELLBEING,WELL BEING
313 NONTRIVIAL,NON TRIVIAL
314 PREEXISTING,PRE EXISTING
315 STATEOWNED,STATE OWNED
316 PLUGIN,PLUG IN
317 PROVERSION,PRO VERSION
318 OPTIN,OPT IN
319 FOLLOWUP,FOLLOW UP
320 FOLLOWUPS,FOLLOW UPS
321 WIFI,WI FI
322 THIRDPARTY,THIRD PARTY
323 PROFESSIONALLOOKING,PROFESSIONAL LOOKING
324 FULLSCREEN,FULL SCREEN
325 BUILTIN,BUILT IN
326 MULTISTREAM,MULTI STREAM
327 LOWCOST,LOW COST
328 RESTREAM,RE STREAM
329 GAMECHANGER,GAME CHANGER
330 WELLDEVELOPED,WELL DEVELOPED
331 QUARTERINCH,QUARTER INCH
332 FASTFASHION,FAST FASHION
333 ECOMMERCE,E COMMERCE
334 PRIZEWINNING,PRIZE WINNING
335 NEVERENDING,NEVER ENDING
336 MINDBLOWING,MIND BLOWING
337 REALLIFE,REAL LIFE
338 REOPEN,RE OPEN
339 ONDEMAND,ON DEMAND
340 PROBLEMSOLVING,PROBLEM SOLVING
341 HEAVYHANDED,HEAVY HANDED
342 OPENENDED,OPEN ENDED
343 SELFCONTROL,SELF CONTROL
344 WELLMEANING,WELL MEANING
345 COHOST,CO HOST
346 RIGHTSBASED,RIGHTS BASED
347 HALFBROTHER,HALF BROTHER
348 FATHERINLAW,FATHER IN LAW
349 COAUTHOR,CO AUTHOR
350 REELECTION,RE ELECTION
351 SELFHELP,SELF HELP
352 PROLIFE,PRO LIFE
353 ANTIDUKE,ANTI DUKE
354 POSTSTRUCTURALIST,POST STRUCTURALIST
355 COFOUNDED,CO FOUNDED
356 XRAY,X RAY
357 ALLAROUND,ALL AROUND
358 HIGHTECH,HIGH TECH
359 TMOBILE,T MOBILE
360 INHOUSE,IN HOUSE
361 POSTMORTEM,POST MORTEM
362 LITTLEKNOWN,LITTLE KNOWN
363 FALSEPOSITIVE,FALSE POSITIVE
364 ANTIVAXXER,ANTI VAXXER
365 EMAILS,E MAILS
366 DRIVETHROUGH,DRIVE THROUGH
367 DAYTODAY,DAY TO DAY
368 COSTAR,CO STAR
369 EBAY,E BAY
370 KOOLAID,KOOL AID
371 ANTIDEMOCRATIC,ANTI DEMOCRATIC
372 MIDDLEAGED,MIDDLE AGED
373 SHORTLIVED,SHORT LIVED
374 BESTSELLING,BEST SELLING
375 TICTACS,TIC TACS
376 UHHUH,UH HUH
377 MULTITANK,MULTI TANK
378 JAWDROPPING,JAW DROPPING
379 LIVESTREAMING,LIVE STREAMING
380 HARDWORKING,HARD WORKING
381 BOTTOMDWELLING,BOTTOM DWELLING
382 PRESHOW,PRE SHOW
383 HANDSFREE,HANDS FREE
384 TRICKORTREATING,TRICK OR TREATING
385 PRERECORDED,PRE RECORDED
386 DOGOODERS,DO GOODERS
387 WIDERANGING,WIDE RANGING
388 LIFESAVING,LIFE SAVING
389 SKIREPORT,SKI REPORT
390 SNOWBASE,SNOW BASE
391 JAYZ,JAY Z
392 SPIDERMAN,SPIDER MAN
393 FREEKICK,FREE KICK
394 EDWARDSHELAIRE,EDWARDS HELAIRE
395 SHORTTERM,SHORT TERM
396 HAVENOTS,HAVE NOTS
397 SELFINTEREST,SELF INTEREST
398 SELFINTERESTED,SELF INTERESTED
399 SELFCOMPASSION,SELF COMPASSION
400 MACHINELEARNING,MACHINE LEARNING
401 COAUTHORED,CO AUTHORED
402 NONGOVERNMENT,NON GOVERNMENT
403 SUBSAHARAN,SUB SAHARAN
404 COCHAIR,CO CHAIR
405 LARGESCALE,LARGE SCALE
406 VIDEOONDEMAND,VIDEO ON DEMAND
407 FIRSTCLASS,FIRST CLASS
408 COFOUNDERS,CO FOUNDERS
409 COOP,CO OP
410 PREORDERS,PRE ORDERS
411 DOUBLEENTRY,DOUBLE ENTRY
412 SELFCONFIDENT,SELF CONFIDENT
413 SELFPORTRAIT,SELF PORTRAIT
414 NONWHITE,NON WHITE
415 ONBOARD,ON BOARD
416 HALFLIFE,HALF LIFE
417 ONCOURT,ON COURT
418 SCIFI,SCI FI
419 XMEN,X MEN
420 DAYLEWIS,DAY LEWIS
421 LALALAND,LA LA LAND
422 AWARDWINNING,AWARD WINNING
423 BOXOFFICE,BOX OFFICE
424 TRIDACTYLS,TRI DACTYLS
425 TRIDACTYL,TRI DACTYL
426 MEDIUMSIZED,MEDIUM SIZED
427 POSTSECONDARY,POST SECONDARY
428 FULLTIME,FULL TIME
429 GOKART,GO KART
430 OPENAIR,OPEN AIR
431 WELLKNOWN,WELL KNOWN
432 ICECREAM,ICE CREAM
433 EARTHMOON,EARTH MOON
434 STATEOFTHEART,STATE OF THE ART
435 BSIDE,B SIDE
436 EASTWEST,EAST WEST
437 ALLSTAR,ALL STAR
438 RUNNERUP,RUNNER UP
439 HORSEDRAWN,HORSE DRAWN
440 OPENSOURCE,OPEN SOURCE
441 PURPOSEBUILT,PURPOSE BUILT
442 SQUAREFREE,SQUARE FREE
443 PRESENTDAY,PRESENT DAY
444 CANADAUNITED,CANADA UNITED
445 HOTCHPOTCH,HOTCH POTCH
446 LOWLYING,LOW LYING
447 RIGHTHANDED,RIGHT HANDED
448 PEARSHAPED,PEAR SHAPED
449 BESTKNOWN,BEST KNOWN
450 FULLLENGTH,FULL LENGTH
451 YEARROUND,YEAR ROUND
452 PREELECTION,PRE ELECTION
453 RERECORD,RE RECORD
454 MINIALBUM,MINI ALBUM
455 LONGESTRUNNING,LONGEST RUNNING
456 ALLIRELAND,ALL IRELAND
457 NORTHWESTERN,NORTH WESTERN
458 PARTTIME,PART TIME
459 NONGOVERNMENTAL,NON GOVERNMENTAL
460 ONLINE,ON LINE
461 ONAIR,ON AIR
462 NORTHSOUTH,NORTH SOUTH
463 RERELEASED,RE RELEASED
464 LEFTHANDED,LEFT HANDED
465 BSIDES,B SIDES
466 ANGLOSAXON,ANGLO SAXON
467 SOUTHSOUTHEAST,SOUTH SOUTHEAST
468 CROSSCOUNTRY,CROSS COUNTRY
469 REBUILT,RE BUILT
470 FREEFORM,FREE FORM
471 SCOOBYDOO,SCOOBY DOO
472 ATLARGE,AT LARGE
473 COUNCILMANAGER,COUNCIL MANAGER
474 LONGRUNNING,LONG RUNNING
475 PREWAR,PRE WAR
476 REELECTED,RE ELECTED
477 HIGHSCHOOL,HIGH SCHOOL
478 RUNNERSUP,RUNNERS UP
479 NORTHWEST,NORTH WEST
480 WEBBASED,WEB BASED
481 HIGHQUALITY,HIGH QUALITY
482 RIGHTWING,RIGHT WING
483 LANEFOX,LANE FOX
484 PAYPERVIEW,PAY PER VIEW
485 COPRODUCTION,CO PRODUCTION
486 NONPARTISAN,NON PARTISAN
487 FIRSTPERSON,FIRST PERSON
488 WORLDRENOWNED,WORLD RENOWNED
489 VICEPRESIDENT,VICE PRESIDENT
490 PROROMAN,PRO ROMAN
491 COPRODUCED,CO PRODUCED
492 LOWPOWER,LOW POWER
493 SELFESTEEM,SELF ESTEEM
494 SEMITRANSPARENT,SEMI TRANSPARENT
495 SECONDINCOMMAND,SECOND IN COMMAND
496 HIGHRISE,HIGH RISE
497 COHOSTED,CO HOSTED
498 AFRICANAMERICAN,AFRICAN AMERICAN
499 SOUTHWEST,SOUTH WEST
500 WELLPRESERVED,WELL PRESERVED
501 FEATURELENGTH,FEATURE LENGTH
502 HIPHOP,HIP HOP
503 ALLBIG,ALL BIG
504 SOUTHEAST,SOUTH EAST
505 COUNTERATTACK,COUNTER ATTACK
506 QUARTERFINALS,QUARTER FINALS
507 STABLEDOOR,STABLE DOOR
508 DARKEYED,DARK EYED
509 ALLAMERICAN,ALL AMERICAN
510 THIRDPERSON,THIRD PERSON
511 LOWLEVEL,LOW LEVEL
512 NTERMINAL,N TERMINAL
513 DRIEDUP,DRIED UP
514 AFRICANAMERICANS,AFRICAN AMERICANS
515 ANTIAPARTHEID,ANTI APARTHEID
516 STOKEONTRENT,STOKE ON TRENT
517 NORTHNORTHEAST,NORTH NORTHEAST
518 BRANDNEW,BRAND NEW
519 RIGHTANGLED,RIGHT ANGLED
520 GOVERNMENTOWNED,GOVERNMENT OWNED
521 SONINLAW,SON IN LAW
522 SUBJECTOBJECTVERB,SUBJECT OBJECT VERB
523 LEFTARM,LEFT ARM
524 LONGLIVED,LONG LIVED
525 REDEYE,RED EYE
526 TPOSE,T POSE
527 NIGHTVISION,NIGHT VISION
528 SOUTHEASTERN,SOUTH EASTERN
529 WELLRECEIVED,WELL RECEIVED
530 ALFAYOUM,AL FAYOUM
531 TIMEBASED,TIME BASED
532 KETTLEDRUMS,KETTLE DRUMS
533 BRIGHTEYED,BRIGHT EYED
534 REDBROWN,RED BROWN
535 SAMESEX,SAME SEX
536 PORTDEPAIX,PORT DE PAIX
537 CLEANUP,CLEAN UP
538 PERCENT,PERCENT SIGN
539 TAKEOUT,TAKE OUT
540 KNOWHOW,KNOW HOW
541 FISHBONE,FISH BONE
542 FISHSTICKS,FISH STICKS
543 PAPERWORK,PAPER WORK
544 NICKNACKS,NICK NACKS
545 STREETTALKING,STREET TALKING
546 NONACADEMIC,NON ACADEMIC
547 SHELLY,SHELLEY
548 SHELLY'S,SHELLEY'S
549 JIMMY,JIMMIE
550 JIMMY'S,JIMMIE'S
551 DRUGSTORE,DRUG STORE
552 THRU,THROUGH
553 PLAYDATE,PLAY DATE
554 MICROLIFE,MICRO LIFE
555 SKILLSET,SKILL SET
556 SKILLSETS,SKILL SETS
557 TRADEOFF,TRADE OFF
558 TRADEOFFS,TRADE OFFS
559 ONSCREEN,ON SCREEN
560 PLAYBACK,PLAY BACK
561 ARTWORK,ART WORK
562 COWORKER,CO WORKER
563 COWORKERS,CO WORKERS
564 SOMETIME,SOME TIME
565 SOMETIMES,SOME TIMES
566 CROWDFUNDING,CROWD FUNDING
567 AM,A.M.,A M
568 PM,P.M.,P M
569 TV,T V
570 MBA,M B A
571 USA,U S A
572 US,U S
573 UK,U K
574 CEO,C E O
575 CFO,C F O
576 COO,C O O
577 CIO,C I O
578 FM,F M
579 GMC,G M C
580 FSC,F S C
581 NPD,N P D
582 APM,A P M
583 NGO,N G O
584 TD,T D
585 LOL,L O L
586 IPO,I P O
587 CNBC,C N B C
588 IPOS,I P OS
589 CNBC's,C N B C'S
590 JT,J T
591 NPR,N P R
592 NPR'S,N P R'S
593 MP,M P
594 IOI,I O I
595 DW,D W
596 CNN,C N N
597 WSM,W S M
598 ET,E T
599 IT,I T
600 RJ,R J
601 DVD,D V D
602 DVD'S,D V D'S
603 HBO,H B O
604 LA,L A
605 XC,X C
606 SUV,S U V
607 NBA,N B A
608 NBA'S,N B A'S
609 ESPN,E S P N
610 ESPN'S,E S P N'S
611 ADT,A D T
612 HD,H D
613 VIP,V I P
614 TMZ,T M Z
615 CBC,C B C
616 NPO,N P O
617 BBC,B B C
618 LA'S,L A'S
619 TMZ'S,T M Z'S
620 HIV,H I V
621 FTC,F T C
622 EU,E U
623 PHD,P H D
624 AI,A I
625 FHI,F H I
626 ICML,I C M L
627 ICLR,I C L R
628 BMW,B M W
629 EV,E V
630 CR,C R
631 API,A P I
632 ICO,I C O
633 LTE,L T E
634 OBS,O B S
635 PC,P C
636 IO,I O
637 CRM,C R M
638 RTMP,R T M P
639 ASMR,A S M R
640 GG,G G
641 WWW,W W W
642 PEI,P E I
643 JJ,J J
644 PT,P T
645 DJ,D J
646 SD,S D
647 POW,P.O.W.,P O W
648 FYI,F Y I
649 DC,D C,D.C
650 ABC,A B C
651 TJ,T J
652 WMDT,W M D T
653 WDTN,W D T N
654 TY,T Y
655 EJ,E J
656 CJ,C J
657 ACL,A C L
658 UK'S,U K'S
659 GTV,G T V
660 MDMA,M D M A
661 DFW,D F W
662 WTF,W T F
663 AJ,A J
664 MD,M D
665 PH,P H
666 ID,I D
667 SEO,S E O
668 UTM'S,U T M'S
669 EC,E C
670 UFC,U F C
671 RV,R V
672 UTM,U T M
673 CSV,C S V
674 SMS,S M S
675 GRB,G R B
676 GT,G T
677 LEM,L E M
678 XR,X R
679 EDU,E D U
680 NBC,N B C
681 EMS,E M S
682 CDC,C D C
683 MLK,M L K
684 IE,I E
685 OC,O C
686 HR,H R
687 MA,M A
688 DEE,D E E
689 AP,A P
690 UFO,U F O
691 DE,D E
692 LGBTQ,L G B T Q
693 PTA,P T A
694 NHS,N H S
695 CMA,C M A
696 MGM,M G M
697 AKA,A K A
698 HW,H W
699 GOP,G O P
700 GOP'S,G O P'S
701 FBI,F B I
702 PRX,P R X
703 CTO,C T O
704 URL,U R L
705 EIN,E I N
706 MLS,M L S
707 CSI,C S I
708 AOC,A O C
709 CND,C N D
710 CP,C P
711 PP,P P
712 CLI,C L I
713 PB,P B
714 FDA,F D A
715 MRNA,M R N A
716 PR,P R
717 VP,V P
718 DNC,D N C
719 MSNBC,M S N B C
720 GQ,G Q
721 UT,U T
722 XXI,X X I
723 HRV,H R V
724 WHO,W H O
725 CRO,C R O
726 DPA,D P A
727 PPE,P P E
728 EVA,E V A
729 BP,B P
730 GPS,G P S
731 AR,A R
732 PJ,P J
733 MLM,M L M
734 OLED,O L E D
735 BO,B O
736 VE,V E
737 UN,U N
738 SLS,S L S
739 DM,D M
740 DM'S,D M'S
741 ASAP,A S A P
742 ETA,E T A
743 DOB,D O B
744 BMW,B M W

View File

@@ -0,0 +1,20 @@
ach
ah
eee
eh
er
ew
ha
hee
hm
hmm
hmmm
huh
mm
mmm
oof
uh
uhh
um
oh
hum
1 ach
2 ah
3 eee
4 eh
5 er
6 ew
7 ha
8 hee
9 hm
10 hmm
11 hmmm
12 huh
13 mm
14 mmm
15 oof
16 uh
17 uhh
18 um
19 oh
20 hum

View File

@@ -0,0 +1 @@
nemo_version from commit:eae1684f7f33c2a18de9ecfa42ec7db93d39e631

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,10 @@
# Text Normalization
Text Normalization is part of NeMo's `nemo_text_processing` - a Python package that is installed with the `nemo_toolkit`.
It converts text from written form into its verbalized form, e.g. "123" -> "one hundred twenty three".
See [NeMo documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details.
Tutorial with overview of the package capabilities: [Text_(Inverse)_Normalization.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb)
Tutorial on how to customize the underlying grammars: [WFST_Tutorial.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,350 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
import string
import sys
from collections import defaultdict, namedtuple
from typing import Dict, List, Optional, Set, Tuple
from unicodedata import category
# Token-type sentinels used by the Kaggle text-normalization data format.
EOS_TYPE = "EOS"  # end-of-sentence marker
PUNCT_TYPE = "PUNCT"  # punctuation token (its normalized form is "sil" in the data)
PLAIN_TYPE = "PLAIN"  # trivial token: normalized form equals the written form
# One token of training data: semiotic class, written form, and spoken (normalized) form.
Instance = namedtuple('Instance', 'token_type un_normalized normalized')
# Semiotic classes that appear in the Kaggle Google text-normalization dataset.
known_types = [
    "PLAIN",
    "DATE",
    "CARDINAL",
    "LETTERS",
    "VERBATIM",
    "MEASURE",
    "DECIMAL",
    "ORDINAL",
    "DIGIT",
    "MONEY",
    "TELEPHONE",
    "ELECTRONIC",
    "FRACTION",
    "TIME",
    "ADDRESS",
]
def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]:
    """
    Load one file in the Kaggle Google text-normalization format
    (https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish).

    Each line is tab-separated:
        <semiotic class>\t<unnormalized text>\t<`self` if trivial class, else normalized text>
    and sentences end with a `<eos>\t<eos>` line.

    PLAIN tokens keep their written form as the normalized form; PUNCT tokens are
    dropped; every other class keeps both forms (lower-cased).

    Args:
        file_path: path to the text file
    Returns: flat list of instances
    """
    instances = []
    with open(file_path, 'r') as fp:
        for line in fp:
            fields = line.strip().split("\t")
            if fields[0] == "<eos>":
                instances.append(Instance(token_type=EOS_TYPE, un_normalized="", normalized=""))
                continue
            token_type, token, spoken = fields
            token = token.lower()
            spoken = spoken.lower()
            if token_type == PLAIN_TYPE:
                # trivial class: written and spoken forms coincide
                instances.append(Instance(token_type=token_type, un_normalized=token, normalized=token))
            elif token_type != PUNCT_TYPE:
                instances.append(Instance(token_type=token_type, un_normalized=token, normalized=spoken))
    return instances
def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]:
    """
    Load each file in `file_paths` with `load_func` and concatenate the results.

    Args:
        file_paths: list of file paths
        load_func: loading function (called as load_func(file_path=...))
    Returns: flat list of instances
    """
    instances: List[Instance] = []
    for path in file_paths:
        instances.extend(load_func(file_path=path))
    return instances
def clean_generic(text: str) -> str:
    """
    Lower-case the text and trim surrounding whitespace without affecting
    semiotic classes.

    Args:
        text: string
    Returns: cleaned string
    """
    return text.strip().lower()
def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = None, verbose: bool = True) -> float:
    """
    Evaluates accuracy given predictions and labels.

    Predictions and labels are compared after `clean_generic` normalization
    (strip + lower-case).

    Args:
        preds: predictions
        labels: labels
        input: optional, only needed for verbosity
        verbose: if true prints [input], golden labels and predictions for mismatches
            (previously this flag was accepted but ignored; mismatches now print
            only when both `input` is given and `verbose` is True, which preserves
            the old default behavior)
    Returns accuracy (0.0 for an empty prediction list instead of raising
    ZeroDivisionError)
    """
    total = len(preds)
    if total == 0:
        # guard against ZeroDivisionError on an empty evaluation set
        return 0.0
    correct = 0
    for i in range(total):
        pred_norm = clean_generic(preds[i])
        label_norm = clean_generic(labels[i])
        if pred_norm == label_norm:
            correct += 1
        elif input and verbose:
            # fixed typo in the diagnostic label: "inpu" -> "input"
            print(f"input: {json.dumps(input[i])}")
            print(f"gold: {json.dumps(label_norm)}")
            print(f"pred: {json.dumps(pred_norm)}")
    return correct / total
def training_data_to_tokens(
    data: List[Instance], category: Optional[str] = None
) -> Dict[str, Tuple[List[str], List[str]]]:
    """
    Group instances by token type, optionally restricted to a single semiotic
    class, and collect their written and spoken forms.

    NOTE: the `category` parameter shadows the `unicodedata.category` import
    inside this function (kept for interface compatibility).

    Args:
        data: list of instances
        category: optional semiotic class category name
    Returns Dict: token type -> (list of un_normalized strings, list of normalized strings)
    """
    grouped = defaultdict(lambda: ([], []))
    for instance in data:
        if instance.token_type == EOS_TYPE:
            continue  # sentence boundaries carry no token text
        if category is not None and instance.token_type != category:
            continue
        written, spoken = grouped[instance.token_type]
        written.append(instance.un_normalized)
        spoken.append(instance.normalized)
    return grouped
def training_data_to_sentences(data: List[Instance]) -> Tuple[List[str], List[str], List[Set[str]]]:
    """
    Split the flat instance list into sentences at EOS tokens.

    Args:
        data: list of instances
    Returns (list of unnormalized sentences, list of normalized sentences,
             list of sets of categories in each sentence)
    """
    sentences = []
    categories = []
    current_tokens = []
    current_categories = set()
    for instance in data:
        if instance.token_type == EOS_TYPE:
            # close the current sentence and start a fresh one
            sentences.append(current_tokens)
            categories.append(current_categories)
            current_tokens = []
            current_categories = set()
        else:
            current_tokens.append(instance)
            current_categories.add(instance.token_type)
    un_normalized = [" ".join(tok.un_normalized for tok in sentence) for sentence in sentences]
    normalized = [" ".join(tok.normalized for tok in sentence) for sentence in sentences]
    return un_normalized, normalized, categories
def post_process_punctuation(text: str) -> str:
    """
    Normalized quotes and spaces
    Args:
        text: text
    Returns: text with normalized spaces and quotes
    """
    # NOTE(review): several .replace() targets below render as empty strings ('' / "");
    # they were almost certainly Unicode quote characters (curly quotes, prime marks)
    # lost in transit/encoding. As literally written, str.replace('', x) inserts x
    # between every character of the string — verify these literals against the
    # upstream NeMo source before relying on this function.
    text = (
        text.replace('( ', '(')
        .replace(' )', ')')
        .replace('{ ', '{')
        .replace(' }', '}')
        .replace('[ ', '[')
        .replace(' ]', ']')
        .replace(' ', ' ')
        .replace('', '"')
        .replace("", "'")
        .replace("»", '"')
        .replace("«", '"')
        .replace("\\", "")
        .replace("", '"')
        .replace("´", "'")
        .replace("", "'")
        .replace('', '"')
        .replace("", "'")
        .replace('`', "'")
        .replace('- -', "--")
    )
    # drop the space that detokenization leaves in front of sentence punctuation
    for punct in "!,.:;?":
        text = text.replace(f' {punct}', punct)
    return text.strip()
def pre_process(text: str) -> str:
    """
    Optional text preprocessing before normalization (part of TTS TN pipeline):
    pads square brackets with spaces, then collapses runs of spaces.

    Args:
        text: string that may include semiotic classes
    Returns: text with spaces around punctuation marks
    """
    for bracket in '[]':
        text = text.replace(bracket, ' ' + bracket + ' ')
    # collapse any run of spaces introduced above
    return re.sub(r' +', ' ', text)
def load_file(file_path: str) -> List[str]:
    """
    Read a text file and return its lines (trailing newline characters are kept).

    Args:
        file_path: file path
    Returns: flat list of string
    """
    with open(file_path, 'r') as fp:
        return [line for line in fp]
def write_file(file_path: str, data: List[str]):
    """
    Write each string in `data` to the file, one per line.

    Args:
        file_path: file path
        data: list of string
    """
    with open(file_path, 'w') as fp:
        fp.writelines(line + '\n' for line in data)
def post_process_punct(input: str, normalized_text: str, add_unicode_punct: bool = False) -> str:
    """
    Post-processing of the normalized output to match input in terms of spaces
    around punctuation marks.

    After NN normalization, Moses detokenization puts a space after punctuation
    marks and attaches an opening quote "'" to the word on its right, e.g. the
    input "12 test' example" comes back as "twelve test 'example". This function
    shifts the punctuation and spacing of the normalized text back to match the
    input: "twelve test 'example" -> "twelve test' example".

    Args:
        input: input text (original input to the NN, before normalization or tokenization)
        normalized_text: output text (output of the TN NN model)
        add_unicode_punct: set to True to also handle Unicode punctuation marks
            (category "P*") in addition to string.punctuation (increases post
            processing time)

    Returns: normalized_text with punctuation spacing aligned to the input
    """
    # in the post-processing WFST graph "``" are replaced with '"' quotes (otherwise
    # single quotes "`" won't be handled correctly); mirror that replacement here so
    # the new double quotes are matched against the input too
    if "``" in input and "``" not in normalized_text:
        input = input.replace("``", '"')
    # work on character lists so individual positions can be edited in place
    input = [x for x in input]
    normalized_text = [x for x in normalized_text]
    punct_marks = [x for x in string.punctuation if x in input]
    if add_unicode_punct:
        # FIX: the original referenced an undefined `punct_default`, used `sys`
        # without importing it, and assigned the None result of list.extend()
        # back to punct_marks (crashing the loop below).
        punct_unicode = [
            chr(i)
            for i in range(sys.maxunicode)
            if category(chr(i)).startswith("P") and chr(i) not in punct_marks and chr(i) in input
        ]
        punct_marks = punct_marks + punct_unicode

    def _is_valid(idx_out, idx_in, normalized_text, input):
        """Check if previous or next character matches between the two sequences
        (punctuation can be part of a semiotic token and thus missing from the
        normalized text). Hoisted out of the loop: it was redefined per iteration."""
        return (idx_out > 0 and idx_in > 0 and normalized_text[idx_out - 1] == input[idx_in - 1]) or (
            idx_out < len(normalized_text) - 1
            and idx_in < len(input) - 1
            and normalized_text[idx_out + 1] == input[idx_in + 1]
        )

    for punct in punct_marks:
        try:
            equal = input.count(punct) == normalized_text.count(punct)
            idx_in, idx_out = 0, 0
            while punct in input[idx_in:]:
                idx_out = normalized_text.index(punct, idx_out)
                idx_in = input.index(punct, idx_in)
                if not equal and not _is_valid(idx_out, idx_in, normalized_text, input):
                    idx_in += 1
                    continue
                if idx_in > 0 and idx_out > 0:
                    # align the space (or absence of one) *before* the mark
                    if normalized_text[idx_out - 1] == " " and input[idx_in - 1] != " ":
                        normalized_text[idx_out - 1] = ""
                    elif normalized_text[idx_out - 1] != " " and input[idx_in - 1] == " ":
                        normalized_text[idx_out - 1] += " "
                if idx_in < len(input) - 1 and idx_out < len(normalized_text) - 1:
                    # align the space (or absence of one) *after* the mark
                    if normalized_text[idx_out + 1] == " " and input[idx_in + 1] != " ":
                        normalized_text[idx_out + 1] = ""
                    elif normalized_text[idx_out + 1] != " " and input[idx_in + 1] == " ":
                        normalized_text[idx_out] = normalized_text[idx_out] + " "
                idx_out += 1
                idx_in += 1
        except Exception:
            # deliberate best-effort: .index() can raise when counts differ;
            # skip this punctuation mark and keep going (narrowed from bare except)
            pass
    return re.sub(r' +', ' ', "".join(normalized_text))

View File

@@ -0,0 +1,17 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst

View File

@@ -0,0 +1,342 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser
from typing import List
import regex as re
from nemo_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (list of instances) for text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should only include characters and whitespace characters but no punctuation.
Cardinal unnormalized instances should contain at least one integer and all other characters are removed.
"""
class Filter:
    """
    Couples a semiotic class with a keep-predicate and a transformation.

    Args:
        class_type: semiotic class used in dataset
        process_func: function to transform text of that class
        filter_func: function deciding whether an instance of that class is kept
    """

    def __init__(self, class_type: str, process_func: object, filter_func: object):
        self.class_type = class_type
        self.process_func = process_func
        self.filter_func = filter_func

    def filter(self, instance: Instance) -> bool:
        """
        Apply filter_func to the instance.

        Returns: True if the instance fulfills the criteria or does not belong
        to this filter's class type
        """
        if instance.token_type != self.class_type:
            return True
        return self.filter_func(instance)

    def process(self, instance: Instance) -> Instance:
        """
        Apply process_func to the instance.

        Returns: the processed instance if it belongs to this filter's class
        type, otherwise the original instance
        """
        return instance if instance.token_type != self.class_type else self.process_func(instance)
def filter_cardinal_1(instance: Instance) -> bool:
    # keep cardinals whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_cardinal_1(instance: Instance) -> Instance:
    # written form: digits only; spoken form: lowercase letters and spaces only
    return Instance(
        token_type=instance.token_type,
        un_normalized=re.sub(r"[^0-9]", "", instance.un_normalized),
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_ordinal_1(instance: Instance) -> bool:
    # keep ordinals written with an English suffix (1st, 2nd, 3rd, 4th, ...)
    return re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized)


def process_ordinal_1(instance: Instance) -> Instance:
    # written form: strip commas and whitespace; spoken form: lowercase letters and spaces only
    return Instance(
        token_type=instance.token_type,
        un_normalized=re.sub(r"[,\s]", "", instance.un_normalized),
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_decimal_1(instance: Instance) -> bool:
    # keep decimals whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_decimal_1(instance: Instance) -> Instance:
    # written form: drop comma separators; spoken form: lowercase letters and spaces only
    return Instance(
        token_type=instance.token_type,
        un_normalized=re.sub(r",", "", instance.un_normalized),
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )
def filter_measure_1(instance: Instance) -> bool:
    # keep every measure instance
    return True


def process_measure_1(instance: Instance) -> Instance:
    # written form: drop commas and "m2", then separate a digit from a following unit character
    un_normalized = re.sub(r",", "", instance.un_normalized)
    un_normalized = re.sub(r"m2", "", un_normalized)
    un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
    # spoken form: keep letters/whitespace, de-pluralize "per ...s", then keep letters/spaces
    normalized = re.sub(r"[^a-z\s]", "", instance.normalized)
    normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
    normalized = re.sub(r"[^a-z ]", "", normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_money_1(instance: Instance) -> bool:
    # keep money instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_money_1(instance: Instance) -> Instance:
    # written form: drop commas, map a$/us$ to $, expand trailing m/b(n) to million/billion
    un_normalized = re.sub(r",", "", instance.un_normalized)
    un_normalized = re.sub(r"a\$", r"$", un_normalized)
    un_normalized = re.sub(r"us\$", r"$", un_normalized)
    un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
    un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
    # spoken form: lowercase letters and spaces only
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_time_1(instance: Instance) -> bool:
    # keep time instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_time_1(instance: Instance) -> Instance:
    # written form: tighten "H: MM" to "H:MM" and canonicalize am/pm to "a.m."/"p.m."
    un_normalized = re.sub(r": ", ":", instance.un_normalized)
    un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
    un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
    # spoken form: lowercase letters and spaces only
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
def filter_plain_1(instance: Instance) -> bool:
    # keep every plain instance
    return True


def process_plain_1(instance: Instance) -> Instance:
    # plain tokens need no transformation; rebuild the instance unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )


def filter_punct_1(instance: Instance) -> bool:
    # keep every punctuation instance
    return True


def process_punct_1(instance: Instance) -> Instance:
    # punctuation tokens need no transformation; rebuild the instance unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )


def filter_date_1(instance: Instance) -> bool:
    # keep every date instance
    return True


def process_date_1(instance: Instance) -> Instance:
    # written form: drop commas; spoken form: lowercase letters and spaces only
    return Instance(
        token_type=instance.token_type,
        un_normalized=re.sub(r",", "", instance.un_normalized),
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_letters_1(instance: Instance) -> bool:
    # keep every letters instance
    return True


def process_letters_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_verbatim_1(instance: Instance) -> bool:
    # keep every verbatim instance
    return True


def process_verbatim_1(instance: Instance) -> Instance:
    # verbatim tokens need no transformation; rebuild the instance unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )
def filter_digit_1(instance: Instance) -> bool:
    # keep digit instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_digit_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_telephone_1(instance: Instance) -> bool:
    # keep telephone instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_telephone_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_electronic_1(instance: Instance) -> bool:
    # keep electronic instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_electronic_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_fraction_1(instance: Instance) -> bool:
    # keep fraction instances whose written form contains at least one digit
    return re.search(r"[0-9]", instance.un_normalized)


def process_fraction_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )


def filter_address_1(instance: Instance) -> bool:
    # keep every address instance
    return True


def process_address_1(instance: Instance) -> Instance:
    # spoken form: lowercase letters and spaces only; written form unchanged
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=re.sub(r"[^a-z ]", "", instance.normalized),
    )
# One Filter per semiotic class (order preserved), plus a pass-through entry
# for the end-of-sentence marker.
filters = [
    Filter(class_type=class_type, process_func=process_func, filter_func=filter_func)
    for class_type, process_func, filter_func in [
        ("CARDINAL", process_cardinal_1, filter_cardinal_1),
        ("ORDINAL", process_ordinal_1, filter_ordinal_1),
        ("DECIMAL", process_decimal_1, filter_decimal_1),
        ("MEASURE", process_measure_1, filter_measure_1),
        ("MONEY", process_money_1, filter_money_1),
        ("TIME", process_time_1, filter_time_1),
        ("DATE", process_date_1, filter_date_1),
        ("PLAIN", process_plain_1, filter_plain_1),
        ("PUNCT", process_punct_1, filter_punct_1),
        ("LETTERS", process_letters_1, filter_letters_1),
        ("VERBATIM", process_verbatim_1, filter_verbatim_1),
        ("DIGIT", process_digit_1, filter_digit_1),
        ("TELEPHONE", process_telephone_1, filter_telephone_1),
        ("ELECTRONIC", process_electronic_1, filter_electronic_1),
        ("FRACTION", process_fraction_1, filter_fraction_1),
        ("ADDRESS", process_address_1, filter_address_1),
        (EOS_TYPE, lambda x: x, lambda x: True),
    ]
]
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
    """
    Filter and transform a list of instances using the module-level `filters`.

    An instance is kept only if at least one filter matched its class and
    accepted it; kept instances are transformed by the matching filters.

    Args:
        data: list of instances
        verbose: if True, print every kept instance
    Returns: filtered and transformed list of instances
    """
    kept = []
    for instance in data:
        matched = False
        for fil in filters:
            if fil.class_type == instance.token_type and fil.filter(instance):
                instance = fil.process(instance)
                matched = True
        if not matched:
            continue
        if verbose:
            print(instance)
        kept.append(instance)
    return kept
def parse_args():
    """Command-line arguments for the data-filtering script."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100')
    arg_parser.add_argument("--verbose", help="print filtered instances", action='store_true')
    return arg_parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    input_path = args.input
    print("Loading training data: " + input_path)
    # load -> filter/clean -> group into sentences
    instances = load_files([input_path])  # List of instances
    filtered_instances = filter_loaded_data(instances, args.verbose)
    training_data_to_sentences(filtered_instances)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,14 @@
st Street
street Street
expy Expressway
fwy Freeway
hwy Highway
dr Drive
ct Court
ave Avenue
av Avenue
cir Circle
blvd Boulevard
alley Alley
way Way
jct Junction
1 st Street
2 street Street
3 expy Expressway
4 fwy Freeway
5 hwy Highway
6 dr Drive
7 ct Court
8 ave Avenue
9 av Avenue
10 cir Circle
11 blvd Boulevard
12 alley Alley
13 way Way
14 jct Junction

View File

@@ -0,0 +1,52 @@
Alabama AL
Alaska AK
Arizona AZ
Arkansas AR
California CA
Colorado CO
Connecticut CT
Delaware DE
Florida FL
Georgia GA
Hawaii HI
Idaho ID
Illinois IL
Indiana IN
Indiana IND
Iowa IA
Kansas KS
Kentucky KY
Louisiana LA
Maine ME
Maryland MD
Massachusetts MA
Michigan MI
Minnesota MN
Mississippi MS
Missouri MO
Montana MT
Nebraska NE
Nevada NV
New Hampshire NH
New Jersey NJ
New Mexico NM
New York NY
North Carolina NC
North Dakota ND
Ohio OH
Oklahoma OK
Oregon OR
Pennsylvania PA
Rhode Island RI
South Carolina SC
South Dakota SD
Tennessee TN
Tennessee TENN
Texas TX
Utah UT
Vermont VT
Virginia VA
Washington WA
West Virginia WV
Wisconsin WI
Wyoming WY
1 Alabama AL
2 Alaska AK
3 Arizona AZ
4 Arkansas AR
5 California CA
6 Colorado CO
7 Connecticut CT
8 Delaware DE
9 Florida FL
10 Georgia GA
11 Hawaii HI
12 Idaho ID
13 Illinois IL
14 Indiana IN
15 Indiana IND
16 Iowa IA
17 Kansas KS
18 Kentucky KY
19 Louisiana LA
20 Maine ME
21 Maryland MD
22 Massachusetts MA
23 Michigan MI
24 Minnesota MN
25 Mississippi MS
26 Missouri MO
27 Montana MT
28 Nebraska NE
29 Nevada NV
30 New Hampshire NH
31 New Jersey NJ
32 New Mexico NM
33 New York NY
34 North Carolina NC
35 North Dakota ND
36 Ohio OH
37 Oklahoma OK
38 Oregon OR
39 Pennsylvania PA
40 Rhode Island RI
41 South Carolina SC
42 South Dakota SD
43 Tennessee TN
44 Tennessee TENN
45 Texas TX
46 Utah UT
47 Vermont VT
48 Virginia VA
49 Washington WA
50 West Virginia WV
51 Wisconsin WI
52 Wyoming WY

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,31 @@
one
two
three
four
five
six
seven
eight
nine
ten
eleven
twelve
thirteen
fourteen
fifteen
sixteen
seventeen
eighteen
nineteen
twenty
twenty one
twenty two
twenty three
twenty four
twenty five
twenty six
twenty seven
twenty eight
twenty nine
thirty
thirty one
1 one
2 two
3 three
4 four
5 five
6 six
7 seven
8 eight
9 nine
10 ten
11 eleven
12 twelve
13 thirteen
14 fourteen
15 fifteen
16 sixteen
17 seventeen
18 eighteen
19 nineteen
20 twenty
21 twenty one
22 twenty two
23 twenty three
24 twenty four
25 twenty five
26 twenty six
27 twenty seven
28 twenty eight
29 twenty nine
30 thirty
31 thirty one

View File

@@ -0,0 +1,12 @@
jan january
feb february
mar march
apr april
jun june
jul july
aug august
sep september
sept september
oct october
nov november
dec december
1 jan january
2 feb february
3 mar march
4 apr april
5 jun june
6 jul july
7 aug august
8 sep september
9 sept september
10 oct october
11 nov november
12 dec december

View File

@@ -0,0 +1,12 @@
january
february
march
april
may
june
july
august
september
october
november
december
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december

View File

@@ -0,0 +1,24 @@
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december
01 january
02 february
03 march
04 april
05 may
06 june
07 july
08 august
09 september
10 october
11 november
12 december
1 1 january
2 2 february
3 3 march
4 4 april
5 5 may
6 6 june
7 7 july
8 8 august
9 9 september
10 10 october
11 11 november
12 12 december
13 01 january
14 02 february
15 03 march
16 04 april
17 05 may
18 06 june
19 07 july
20 08 august
21 09 september
22 10 october
23 11 november
24 12 december

View File

@@ -0,0 +1,16 @@
A. D AD
A.D AD
a. d AD
a.d AD
a. d. AD
a.d. AD
B. C BC
B.C BC
b. c BC
b.c BC
A. D. AD
A.D. AD
B. C. BC
B.C. BC
b. c. BC
b.c. BC
1 A. D AD
2 A.D AD
3 a. d AD
4 a.d AD
5 a. d. AD
6 a.d. AD
7 B. C BC
8 B.C BC
9 b. c BC
10 b.c BC
11 A. D. AD
12 A.D. AD
13 B. C. BC
14 B.C. BC
15 b. c. BC
16 b.c. BC

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,12 @@
.com dot com
.org dot org
.gov dot gov
.uk dot UK
.fr dot FR
.net dot net
.br dot BR
.in dot IN
.ru dot RU
.de dot DE
.it dot IT
.jpg dot jpeg
1 .com dot com
2 .org dot org
3 .gov dot gov
4 .uk dot UK
5 .fr dot FR
6 .net dot net
7 .br dot BR
8 .in dot IN
9 .ru dot RU
10 .de dot DE
11 .it dot IT
12 .jpg dot jpeg

View File

@@ -0,0 +1,21 @@
. dot
- dash
_ underscore
! exclamation mark
# number sign
$ dollar sign
% percent sign
& ampersand
' quote
* asterisk
+ plus
/ slash
= equal sign
? question mark
^ circumflex
` right single quote
{ left brace
| vertical bar
} right brace
~ tilde
, comma
1 . dot
2 - dash
3 _ underscore
4 ! exclamation mark
5 # number sign
6 $ dollar sign
7 % percent sign
8 & ampersand
9 ' quote
10 * asterisk
11 + plus
12 / slash
13 = equal sign
14 ? question mark
15 ^ circumflex
16 ` right single quote
17 { left brace
18 | vertical bar
19 } right brace
20 ~ tilde
21 , comma

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Some files were not shown because too many files have changed in this diff Show More