diff --git a/.DS_Store b/.DS_Store index 1962b1d..ddd7790 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 81c6be4..0000000 --- a/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ - -FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0 -MAINTAINER shiguangchuan@4paradigm.com - -WORKDIR /workspace - -COPY ssh-keygen /bin - -RUN wget -q ftp://ftp.4pd.io/pub/pico/temp/pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl && pip install pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl && rm -f pynini-2.1.6-c p38-cp38-manylinux_2_31_x86_64.whl - -ADD ./requirements.txt /workspace -RUN pip install -r ./requirements.txt -i https://nexus.4pd.io/repository/pypi-all/simple --trusted-host nexus.4pd.io --extra-index-url https://mirrors.aliyun.com/pypi/simple/ \ - && pip cache purge \ - && ssh-keygen -f /workspace/ssh-key-ecdsa -t ecdsa -b 521 -q -N "" - -ADD . /workspace - -EXPOSE 80 - -CMD ["python3", "run_callback.py"] - - -########################### -## Dockerfile(更新后) -#FROM harbor.4pd.io/lab-platform/inf/python:3.9 - -#WORKDIR /app - -## 安装依赖 -##RUN pip install torch librosa flask - -##RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \ -## pip cache purge && \ -## pip --default-timeout=1000 install torch librosa flask - -## 删除原来的 COPY pytorch_model.bin /app/ - -#COPY inference.py /app/ -# 只需要复制启动脚本 - -#EXPOSE 80 - -#CMD ["python", "inference.py"] -#################### - - -##############################更新0731################################# - - diff --git a/helm-chart/.DS_Store b/helm-chart/.DS_Store deleted file mode 100644 index 38ac210..0000000 Binary files a/helm-chart/.DS_Store and /dev/null differ diff --git a/helm-chart/README.md b/helm-chart/README.md deleted file mode 100644 index 90bd7e3..0000000 --- a/helm-chart/README.md +++ /dev/null @@ -1,77 +0,0 @@ -## judgeflow chart 的要求 - -### values.yaml 文件必须包含如下字段,并且模板中必须引用 values.yaml 中的如下字段 - -``` -podLabels -env -volumeMounts -volumes -affinity -``` - -### values.yaml 文件必须在 volumeMounts 中声明如下卷 - -``` -workspace -submit -datafile -``` - -## 被测服务(sut) chart 的要求 - -### values.yaml 文件必须包含如下字段,并且资源模板中必须引用 values.yaml 中的如下字段 - -``` -podLabels -affinity -``` - -针对 podLabels 字段,values.yaml 中配置格式如下: - -``` -podLabels: {} -``` - -下面给出示例 - -podLabels - -values.yaml - -templates/deployment.yaml - -``` -metadata: - labels: - {{- with .Values.podLabels }} - {{- toYaml . | nindent 4 }} - {{- end }} -``` - -affinity - -values.yaml - -``` -affinity: {} -``` - -templates/deployment.yaml - -``` -spec: - template: - spec: - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} -``` - -### 如果需要在 sut 中使用共享存储,则 sut chart 的 values.yaml 也必须包含如下字段,且模板中必须引用 values.yaml 中的如下字段 - -``` -volumeMounts -volumes -``` diff --git a/helm-chart/asr-tco/.helmignore b/helm-chart/asr-tco/.helmignore deleted file mode 100644 index 0e8a0eb..0000000 --- a/helm-chart/asr-tco/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/helm-chart/asr-tco/Chart.yaml.tmpl b/helm-chart/asr-tco/Chart.yaml.tmpl deleted file mode 100644 index 35a3153..0000000 --- a/helm-chart/asr-tco/Chart.yaml.tmpl +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v2 -name: ${chartName} -description: Leaderboard judgeflow helm chart for demo - -# A chart can be either an 'application' or a 'library' chart. -# -# Application charts are a collection of templates that can be packaged into versioned archives -# to be deployed. -# -# Library charts provide useful utilities or functions for the chart developer. They're included as -# a dependency of application charts to inject those utilities and functions into the rendering -# pipeline. Library charts do not define any templates and therefore cannot be deployed. -type: application - -# This is the chart version. This version number should be incremented each time you make changes -# to the chart and its templates, including the app version. -# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: ${version} - -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. -appVersion: "${appVersion}" diff --git a/helm-chart/asr-tco/templates/_helpers.tpl b/helm-chart/asr-tco/templates/_helpers.tpl deleted file mode 100644 index e373350..0000000 --- a/helm-chart/asr-tco/templates/_helpers.tpl +++ /dev/null @@ -1,62 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "judgeflow.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "judgeflow.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "judgeflow.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "judgeflow.labels" -}} -helm.sh/chart: {{ include "judgeflow.chart" . }} -{{ include "judgeflow.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "judgeflow.selectorLabels" -}} -app.kubernetes.io/name: {{ include "judgeflow.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "judgeflow.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "judgeflow.fullname" .) .Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} diff --git a/helm-chart/asr-tco/templates/hpa.yaml b/helm-chart/asr-tco/templates/hpa.yaml deleted file mode 100644 index 45ab478..0000000 --- a/helm-chart/asr-tco/templates/hpa.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- if .Values.autoscaling.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "judgeflow.fullname" . }} - labels: - {{- include "judgeflow.labels" . | nindent 4 }} -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "judgeflow.fullname" . }} - minReplicas: {{ .Values.autoscaling.minReplicas }} - maxReplicas: {{ .Values.autoscaling.maxReplicas }} - metrics: - {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} - {{- end }} - {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} - {{- end }} -{{- end }} diff --git a/helm-chart/asr-tco/templates/ingress.yaml b/helm-chart/asr-tco/templates/ingress.yaml deleted file mode 100644 index 959d442..0000000 --- a/helm-chart/asr-tco/templates/ingress.yaml +++ /dev/null @@ -1,61 +0,0 @@ -{{- if .Values.ingress.enabled -}} -{{- $fullName := include "judgeflow.fullname" . -}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }} - labels: - {{- include "judgeflow.labels" . | nindent 4 }} - {{- with .Values.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingress.className }} - {{- end }} - {{- if .Values.ingress.tls }} - tls: - {{- range .Values.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingress.hosts }} - - host: {{ .host | quote }} - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} -{{- end }} diff --git a/helm-chart/asr-tco/templates/job.yaml b/helm-chart/asr-tco/templates/job.yaml deleted file mode 100644 index bc8e51a..0000000 --- a/helm-chart/asr-tco/templates/job.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ include "judgeflow.fullname" . }} - labels: - {{- include "judgeflow.labels" . | nindent 4 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - template: - metadata: - labels: - {{- include "judgeflow.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.priorityclassname }} - priorityClassName: "{{ . }}" - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- with .Values.env }} - env: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- if and (hasKey .Values "service") (hasKey .Values.service "ports") }} - ports: - {{- range .Values.service.ports }} - - name: {{ .name }} - containerPort: {{ .port }} - {{- end }} - {{- end }} - {{- if hasKey .Values "command" }} - command: {{ .Values.command }} - {{- end }} - volumeMounts: - {{- toYaml .Values.volumeMounts | nindent 12 }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - restartPolicy: Never - {{- with .Values.volumes }} - volumes: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - backoffLimit: 0 diff --git a/helm-chart/asr-tco/templates/priorityclass.yaml b/helm-chart/asr-tco/templates/priorityclass.yaml deleted file mode 100644 index 7e1a884..0000000 --- a/helm-chart/asr-tco/templates/priorityclass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -{{- if .Values.priorityclassname }} -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: "{{ .Values.priorityclassname }}" -value: {{ .Values.priorityclassvalue }} -globalDefault: false -preemptionPolicy: "Never" -description: "This is a priority class." -{{- end }} diff --git a/helm-chart/asr-tco/templates/service.yaml b/helm-chart/asr-tco/templates/service.yaml deleted file mode 100644 index 034a5d1..0000000 --- a/helm-chart/asr-tco/templates/service.yaml +++ /dev/null @@ -1,22 +0,0 @@ -{{- if and (hasKey .Values "service") (hasKey .Values.service "type") }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "judgeflow.fullname" . }} - labels: - {{- include "judgeflow.labels" . | nindent 4 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - type: {{ .Values.service.type }} - ports: - {{- range .Values.service.ports }} - - port: {{ .port }} - targetPort: {{ .port }} - protocol: TCP - name: {{ .name }} - {{- end }} - selector: - {{- include "judgeflow.selectorLabels" . | nindent 4 }} -{{- end }} diff --git a/helm-chart/asr-tco/templates/serviceaccount.yaml b/helm-chart/asr-tco/templates/serviceaccount.yaml deleted file mode 100644 index 12df5c8..0000000 --- a/helm-chart/asr-tco/templates/serviceaccount.yaml +++ /dev/null @@ -1,13 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "judgeflow.serviceAccountName" . }} - labels: - {{- include "judgeflow.labels" . | nindent 4 }} - {{- with .Values.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -automountServiceAccountToken: {{ .Values.serviceAccount.automount }} -{{- end }} diff --git a/helm-chart/asr-tco/templates/tests/test-connection.yaml b/helm-chart/asr-tco/templates/tests/test-connection.yaml deleted file mode 100644 index c351ca2..0000000 --- a/helm-chart/asr-tco/templates/tests/test-connection.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: {{ include "judgeflow.fullname" . }}-test-connection - labels: - {{- include "judgeflow.labels" . | nindent 4 }} - annotations: - "helm.sh/hook": test -spec: - containers: - - name: wget - image: busybox - command: ['wget'] - args: ['{{ include "judgeflow.fullname" . }}:{{ .Values.service.port }}'] - restartPolicy: Never diff --git a/helm-chart/asr-tco/values.yaml.tmpl b/helm-chart/asr-tco/values.yaml.tmpl deleted file mode 100644 index 0c73595..0000000 --- a/helm-chart/asr-tco/values.yaml.tmpl +++ /dev/null @@ -1,124 +0,0 @@ -# Default values for job_demo. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - -image: - repository: "${imageRepo}" - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "${imageTag}" - -imagePullSecrets: [] -nameOverride: "" -fullnameOverride: "" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - -podAnnotations: {} - -podLabels: - contest.4pd.io/leaderboard-resource-type: judge_flow - contest.4pd.io/leaderboard-job-id: "0" - contest.4pd.io/leaderboard-submit-id: "0" - -podSecurityContext: {} - # fsGroup: 2000 - -securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - -service: - type: ClusterIP - ports: - - name: http - port: 80 - -ingress: - enabled: false - className: "" - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: - - path: / - pathType: ImplementationSpecific - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 3000m - memory: 16Gi - requests: - cpu: 3000m - memory: 16Gi - -autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 100 - targetCPUUtilizationPercentage: 80 - # targetMemoryUtilizationPercentage: 80 - -nodeSelector: - juicefs: "on" - contest.4pd.io/cpu: INTEL-8358 - -tolerations: [] - -affinity: {} - -env: - - name: TZ - value: Asia/Shanghai - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - -#command: '["python","run.py"]' - -volumeMounts: - - name: workspace - mountPath: /tmp/workspace - - name: datafile - mountPath: /tmp/datafile - - name: submit - mountPath: /tmp/submit_config - - name: juicefs-pv - mountPath: /tmp/juicefs - - name: customer - mountPath: /tmp/customer - - name: submit-private - mountPath: /tmp/submit_private - -volumes: - - name: juicefs-pv - persistentVolumeClaim: - claimName: juicefs-pvc - - -priorityclassname: '' -priorityclassvalue: '0' diff --git a/helm-chart/sut/.DS_Store b/helm-chart/sut/.DS_Store deleted file mode 100644 index df37fdf..0000000 Binary files a/helm-chart/sut/.DS_Store and /dev/null differ diff --git a/helm-chart/sut/.helmignore b/helm-chart/sut/.helmignore deleted file mode 100644 index 0e8a0eb..0000000 --- a/helm-chart/sut/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/helm-chart/sut/Chart.yaml b/helm-chart/sut/Chart.yaml deleted file mode 100644 index 5f95483..0000000 --- a/helm-chart/sut/Chart.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v2 -name: sut -description: A Helm chart for Kubernetes - -# A chart can be either an 'application' or a 'library' chart. -# -# Application charts are a collection of templates that can be packaged into versioned archives -# to be deployed. -# -# Library charts provide useful utilities or functions for the chart developer. They're included as -# a dependency of application charts to inject those utilities and functions into the rendering -# pipeline. Library charts do not define any templates and therefore cannot be deployed. -type: application - -# This is the chart version. This version number should be incremented each time you make changes -# to the chart and its templates, including the app version. -# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 - -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. -appVersion: "0.1.0" diff --git a/helm-chart/sut/templates/_helpers.tpl b/helm-chart/sut/templates/_helpers.tpl deleted file mode 100644 index 501d682..0000000 --- a/helm-chart/sut/templates/_helpers.tpl +++ /dev/null @@ -1,62 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "sut.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "sut.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "sut.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "sut.labels" -}} -helm.sh/chart: {{ include "sut.chart" . }} -{{ include "sut.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "sut.selectorLabels" -}} -app.kubernetes.io/name: {{ include "sut.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "sut.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "sut.fullname" .) .Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} diff --git a/helm-chart/sut/templates/deployment.yaml b/helm-chart/sut/templates/deployment.yaml deleted file mode 100644 index ecabb4e..0000000 --- a/helm-chart/sut/templates/deployment.yaml +++ /dev/null @@ -1,94 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "sut.fullname" . }} - labels: - {{- include "sut.labels" . | nindent 4 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if not .Values.autoscaling.enabled }} - replicas: {{ .Values.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "sut.selectorLabels" . | nindent 6 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "sut.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "sut.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - {{- with .Values.priorityclassname }} - priorityClassName: "{{ . }}" - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- with .Values.env }} - env: - {{- toYaml . | nindent 12 }} - {{- end }} - ports: - - name: http - containerPort: {{ .Values.service.port }} - protocol: TCP - {{- with .Values.command }} - command: - {{- toYaml . | nindent 12 }} - {{- end }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - {{- with .Values.volumeMounts }} - volumeMounts: - {{- toYaml . | nindent 12 }} - {{- end }} - - {{- with .Values.livenessProbe }} - livenessProbe: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.readinessProbe }} - readinessProbe: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.startupProbe }} - startupProbe: - {{- toYaml . | nindent 12 }} - {{- end }} - - volumes: - {{- with .Values.volumes }} - {{- toYaml . | nindent 8 }} - {{- end }} - - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - tolerations: - - key: "hosttype" - operator: "Equal" - value: "iluvatar" - effect: "NoSchedule" \ No newline at end of file diff --git a/helm-chart/sut/templates/hpa.yaml b/helm-chart/sut/templates/hpa.yaml deleted file mode 100644 index b3b17a0..0000000 --- a/helm-chart/sut/templates/hpa.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- if .Values.autoscaling.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "sut.fullname" . }} - labels: - {{- include "sut.labels" . | nindent 4 }} -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "sut.fullname" . }} - minReplicas: {{ .Values.autoscaling.minReplicas }} - maxReplicas: {{ .Values.autoscaling.maxReplicas }} - metrics: - {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} - {{- end }} - {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} - {{- end }} -{{- end }} diff --git a/helm-chart/sut/templates/ingress.yaml b/helm-chart/sut/templates/ingress.yaml deleted file mode 100644 index 4ecfe9b..0000000 --- a/helm-chart/sut/templates/ingress.yaml +++ /dev/null @@ -1,61 +0,0 @@ -{{- if .Values.ingress.enabled -}} -{{- $fullName := include "sut.fullname" . -}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }} - labels: - {{- include "sut.labels" . | nindent 4 }} - {{- with .Values.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingress.className }} - {{- end }} - {{- if .Values.ingress.tls }} - tls: - {{- range .Values.ingress.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingress.hosts }} - - host: {{ .host | quote }} - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} -{{- end }} diff --git a/helm-chart/sut/templates/service.yaml b/helm-chart/sut/templates/service.yaml deleted file mode 100644 index 0a1e857..0000000 --- a/helm-chart/sut/templates/service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "sut.fullname" . }} - labels: - {{- include "sut.labels" . | nindent 4 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - type: {{ .Values.service.type }} - ports: - - port: {{ .Values.service.port }} - targetPort: http - protocol: TCP - name: socket - selector: - {{- include "sut.selectorLabels" . | nindent 4 }} diff --git a/helm-chart/sut/templates/serviceaccount.yaml b/helm-chart/sut/templates/serviceaccount.yaml deleted file mode 100644 index 3e9368c..0000000 --- a/helm-chart/sut/templates/serviceaccount.yaml +++ /dev/null @@ -1,13 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "sut.serviceAccountName" . }} - labels: - {{- include "sut.labels" . | nindent 4 }} - {{- with .Values.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -automountServiceAccountToken: {{ .Values.serviceAccount.automount }} -{{- end }} diff --git a/helm-chart/sut/templates/tests/test-connection.yaml b/helm-chart/sut/templates/tests/test-connection.yaml deleted file mode 100644 index d506eb5..0000000 --- a/helm-chart/sut/templates/tests/test-connection.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: "{{ include "sut.fullname" . }}-test-connection" - labels: - {{- include "sut.labels" . | nindent 4 }} - annotations: - "helm.sh/hook": test -spec: - containers: - - name: wget - image: busybox - command: ['wget'] - args: ['{{ include "sut.fullname" . }}:{{ .Values.service.port }}'] - restartPolicy: Never diff --git a/helm-chart/sut/values.yaml.tmpl b/helm-chart/sut/values.yaml.tmpl deleted file mode 100644 index f5d7123..0000000 --- a/helm-chart/sut/values.yaml.tmpl +++ /dev/null @@ -1,144 +0,0 @@ -# Default values for sut. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - -image: - repository: harbor.4pd.io/lab-platform/inf/python - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: 3.9 - -imagePullSecrets: [] -nameOverride: "" -fullnameOverride: "" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # Automatically mount a ServiceAccount's API credentials? - automount: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - -podAnnotations: {} -podLabels: {} -podSecurityContext: {} - # fsGroup: 2000 - -securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - -service: - type: ClusterIP - port: 80 - -ingress: - enabled: false - className: "" - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: - - path: / - pathType: ImplementationSpecific - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 1000m - memory: 4096Mi - requests: - cpu: 1000m - memory: 4096Mi - -autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 100 - targetCPUUtilizationPercentage: 80 - # targetMemoryUtilizationPercentage: 80 - -# Additional volumes on the output Deployment definition. -volumes: [] -# - name: foo -# secret: -# secretName: mysecret -# optional: false - -# Additional volumeMounts on the output Deployment definition. -volumeMounts: [] -# - name: foo -# mountPath: "/etc/foo" -# readOnly: true - -nodeSelector: - contest.4pd.io/accelerator: iluvatar-BI-V100 - -tolerations: - - key: hosttype - operator: Equal - value: iluvatar - effect: NoSchedule - - -affinity: {} - -readinessProbe: - failureThreshold: 1000 - httpGet: - path: /health - port: 80 - scheme: HTTP - -#readinessProbe: -# httpGet: -# path: /health -# port: 80 -# scheme: HTTP -# initialDelaySeconds: 5 # 应用启动后等待 5 秒再开始探测 -# failureThreshold: 5 # 连续失败 3 次后标记为未就绪 -# successThreshold: 1 # 连续成功 1 次后标记为就绪 - -env: - - name: TZ - value: Asia/Shanghai - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: MY_NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - -#command: '' - - -priorityclassname: '' diff --git a/local_test.py b/local_test.py deleted file mode 100644 index 323fc39..0000000 --- a/local_test.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import tempfile -import shutil - -if os.path.exists("/tmp/submit_private"): - shutil.rmtree("/tmp/submit_private") - -with tempfile.TemporaryDirectory() as tempdir: - config_path = os.path.join(tempdir, "config.json") - - assert not os.system(f"ssh-keygen -f {tempdir}/ssh-key-ecdsa -t ecdsa -b 521 -q -N \"\"") - - config = """ - model: whisper - model_key: whisper - config.json: - name: 'faster-whisper-server:latest' - support_devices: - - cpu - model_path: '' - port: 8080 - other_ports: [] - other_ports_count: 1 - entrypoint: start.bat - MIN_CHUNK: 2.5 - MIN_ADD_CHUNK: 2.5 - COMPUTE_TYPE: int8 - NUM_WORKERS: 1 - CPU_THREADS: 2 - BEAM_SIZE: 5 - BATCH: 1 - LANG: auto - DEVICE: cpu - CHUNK_LENGTH: 5 - CLASS_MODEL: ./models/faster-whisper-base - EN_MODEL: ./models/faster-whisper-base - ZH_MODEL: ./models/faster-whisper-base - RU_MODEL: ./models/faster-whisper-base - PT_MODEL: ./models/faster-whisper-base - AR_MODEL: ./models/faster-whisper-base - NEW_VERSION: 1 - NEED_RESET: 0 - leaderboard_options: - nfs: - - name: whisper - srcRelativePath: leaderboard/pc_asr/en.tar.gz - mountPoint: /tmp - source: ceph_customer - """ - - with open(config_path, "w") as f: - f.write(config) - - os.environ["SSH_KEY_DIR"] = tempdir - os.environ["SUBMIT_CONFIG_FILEPATH"] = config_path - os.environ["MODEL_MAPPING"] = '{"whisper": "edge-ml.tar.gz"}' - - from run_async_a10 import get_sut_url_windows - - - print(get_sut_url_windows()) - - import time - time.sleep(3600) \ No newline at end of file diff --git a/mock_env.sh b/mock_env.sh deleted file mode 100644 index d14fd85..0000000 --- a/mock_env.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -export DATASET_FILEPATH=dataset/formatted1/de.zip -export RESULT_FILEPATH=out/result.json -export DETAILED_CASES_FILEPATH=out/detail_cases.json -export SUBMIT_CONFIG_FILEPATH= -export BENCHMARK_NAME= -export MY_POD_IP=127.0.0.1 diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 4959125..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,24 +0,0 @@ -[tool.black] -line-length = 80 -target-version = ['py39'] - -[tool.flake8] -max-line-length = 88 -count=true -per-file-ignores="./annotation/manager.py:F401" -exclude=["./label", "__pycache__", "./migrations", "./logs", "./pids", "./resources"] -ignore=["W503", "E203"] -enable-extensions="G" -application-import-names=["flake8-isort", "flake8-logging-format", "flake8-builtins"] -import-order-style="edited" -extend-ignore = ["E203", "E701"] - -[tool.isort] -py_version=39 -profile="black" -multi_line_output=9 -line_length=80 -group_by_package=true -case_sensitive=true -skip_gitignore=true - diff --git a/run.py b/run.py deleted file mode 100644 index 08b7d5e..0000000 --- a/run.py +++ /dev/null @@ -1,114 +0,0 @@ -import gc -import json -import os -import sys -import time -import zipfile - -import yaml -from schemas.context import ASRContext -from utils.client import Client -from utils.evaluator import BaseEvaluator -from utils.logger import logger -from utils.service import register_sut - -IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None -UNIT_TEST = os.getenv("UNIT_TEST", 0) - - -def main(): - logger.info("执行……") - - dataset_filepath = os.getenv( - "DATASET_FILEPATH", - "./tests/resources/en.zip", - ) - submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config") - result_filepath = os.getenv("RESULT_FILEPATH", "./out/result") - bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase") - detail_cases_filepath = os.getenv("DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl") - - resource_name = os.getenv("BENCHMARK_NAME") - - # 提交配置 & 启动被测服务 - if os.getenv("DATASET_FILEPATH", ""): - from utils.helm import resource_check - - with open(submit_config_filepath, "r") as fp: - st_config = yaml.safe_load(fp) - st_config["values"] = resource_check(st_config.get("values", {})) - if 'docker_images' in st_config: - sut_url = "ws://172.26.1.75:9827" - os.environ['test'] = '1' - elif 'docker_image' in st_config: - sut_url = register_sut(st_config, resource_name) - elif UNIT_TEST: - sut_url = "ws://172.27.231.36:80" - else: - logger.error("config 配置错误,没有 docker_image") - os._exit(1) - else: - os.environ['test'] = '1' - sut_url = "ws://172.27.231.36:80" - if UNIT_TEST: - exit(0) - - """ - # 数据集处理 - local_dataset_path = "./dataset" - os.makedirs(local_dataset_path, exist_ok=True) - with zipfile.ZipFile(dataset_filepath) as zf: - zf.extractall(local_dataset_path) - config_path = os.path.join(local_dataset_path, "data.yaml") - with open(config_path, "r") as fp: - dataset_config = yaml.safe_load(fp) - - # 数据集信息 - dataset_global_config = dataset_config.get("global", {}) - dataset_query = dataset_config.get("query_data", {}) - - evaluator = BaseEvaluator() - - # 开始预测 - for idx, query_item in enumerate(dataset_query): - gc.collect() - logger.info(f"开始执行 {idx} 条数据") - - context = ASRContext(**dataset_global_config) - context.lang = query_item.get("lang", context.lang) - context.file_path = os.path.join(local_dataset_path, query_item["file"]) - # context.audio_length = query_item["audio_length"] - - interactions = Client(sut_url, context).action() - context.append_labels(query_item["voice"]) - context.append_preds( - interactions["predict_data"], - interactions["send_time"], - interactions["recv_time"], - ) - context.fail = interactions["fail"] - if IN_TEST: - with open('output.txt', 'w') as fp: - original_stdout = sys.stdout - sys.stdout = fp - print(context) - sys.stdout = original_stdout - evaluator.evaluate(context) - detail_case = evaluator.gen_detail_case() - with open(detail_cases_filepath, "a") as fp: - fp.write(json.dumps(detail_case.to_dict(), ensure_ascii=False) + "\n") - time.sleep(4) - - evaluator.post_evaluate() - output_result = evaluator.gen_result() - # print(evaluator.__dict__) - logger.info("执行完成. Result = {output_result}") - - with open(result_filepath, "w") as fp: - json.dump(output_result, fp, indent=2, ensure_ascii=False) - with open(bad_cases_filepath, "w") as fp: - fp.write("当前榜单不存在 Bad Case\n") - """ - -if __name__ == "__main__": - main() diff --git a/run_async_a10.py b/run_async_a10.py deleted file mode 100644 index df77568..0000000 --- a/run_async_a10.py +++ /dev/null @@ -1,757 +0,0 @@ -import atexit -import concurrent.futures -import fcntl -import gc -import glob -import json -import os -import random -import signal -import sys -import tempfile -import threading -import time -import zipfile -from concurrent.futures import ThreadPoolExecutor - -import yaml -from fabric import Connection -from vmplatform import VMOS, Client, VMDataDisk - -from schemas.context import ASRContext -from utils.client_async import ClientAsync -from utils.evaluator import BaseEvaluator -from utils.logger import logger -from utils.service import register_sut - -IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None -UNIT_TEST = os.getenv("UNIT_TEST", 0) - -DATASET_NUM = os.getenv("DATASET_NUM") - -# vm榜单参数 -SUT_TYPE = os.getenv("SUT_TYPE", "kubernetes") -SHARE_SUT = os.getenv("SHARE_SUT", "true") == "true" -VM_ID = 0 -VM_IP = "" -do_deploy_chart = True -VM_CPU = int(os.getenv("VM_CPU", "2")) -VM_MEM = int(os.getenv("VM_MEM", "4096")) -MODEL_BASEPATH = os.getenv("MODEL_BASEPATH", "/tmp/customer/leaderboard/pc_asr") -MODEL_MAPPING = json.loads(os.getenv("MODEL_MAPPING", "{}")) -SSH_KEY_DIR = os.getenv("SSH_KEY_DIR", "/workspace") -SSH_PUBLIC_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa.pub") -SSH_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa") - -CONNECT_KWARGS = {"key_filename": SSH_KEY_FILE} - -# 共享sut参数 -JOB_ID = os.getenv("JOB_ID") -dirname = "/tmp/submit_private/sut_share" -os.makedirs(dirname, exist_ok=True) -SUT_SHARE_LOCK = os.path.join(dirname, "lock.lock") -SUT_SHARE_USE_LOCK = os.path.join(dirname, "use.lock") -SUT_SHARE_STATUS = os.path.join(dirname, "status.json") -SUT_SHARE_JOB_STATUS = os.path.join(dirname, f"job_status.{JOB_ID}") -SUT_SHARE_PUBLIC_FAIL = os.path.join(dirname, "one_job_failed") -fd_lock = open(SUT_SHARE_USE_LOCK, "a") - - -def clean_vm_atexit(): - global VM_ID, do_deploy_chart - if not VM_ID: - return - if not do_deploy_chart: - return - logger.info("删除vm") - vmclient = Client() - err_msg = vmclient.delete_vm(VM_ID) - if err_msg: - logger.warning(f"删除vm失败: {err_msg}") - - -def put_file_to_vm(c: Connection, local_path: str, remote_path: str): - logger.info(f"uploading file {local_path} to {remote_path}") - result = c.put(local_path, remote_path) - logger.info("uploaded {0.local} to {0.remote}".format(result)) - - -def deploy_windows_sut(): - global VM_ID - global VM_IP - - submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "") - with open(submit_config_filepath, "r") as fp: - st_config = yaml.safe_load(fp) - assert "model" in st_config, "未配置model" - assert "model_key" in st_config, "未配置model_key" - assert "config.json" in st_config, "未配置config.json" - nfs = st_config.get("leaderboard_options", {}).get("nfs", []) - assert len(nfs) > 0, "未配置nfs" - assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内" - - model = st_config["model"] - model_key = st_config["model_key"] - model_path = "" - config = st_config["config.json"] - exist = False - for nfs_item in nfs: - if nfs_item["name"] == model_key: - exist = True - if nfs_item["source"] == "ceph_customer": - model_path = os.path.join( - "/tmp/customer", - nfs_item["srcRelativePath"], - ) - else: - model_path = os.path.join( - "/tmp/juicefs", - nfs_item["srcRelativePath"], - ) - break - if not exist: - raise RuntimeError(f"未找到nfs配置项 name={model_key}") - config_path = os.path.join(tempfile.mkdtemp(), "config.json") - model_dir = os.path.basename(model_path).split(".")[0] - config["model_path"] = f"E:\\model\\{model_dir}" - with open(config_path, "w") as fp: - json.dump(config, fp, ensure_ascii=False, indent=4) - - vmclient = Client() - with open(SSH_PUBLIC_KEY_FILE, "r") as fp: - sshpublickey = fp.read().rstrip() - VM_ID = vmclient.create_vm( - "amd64", - VMOS.windows10, - VM_CPU, - VM_MEM, - "leaderboard-%s-submit-%s-job-%s" - % ( - os.getenv("BENCHMARK_NAME"), - os.getenv("SUBMIT_ID"), - os.getenv("JOB_ID"), - ), - sshpublickey, - datadisks=[ - VMDataDisk( - size=50, - disk_type="ssd", - mount_path="/", - filesystem="NTFS", - ) - ], - ) - atexit.register(clean_vm_atexit) - signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum)) - VM_IP = vmclient.wait_until_vm_running(VM_ID) - logger.info("vm created successfully, vm_ip: %s", VM_IP) - - def sut_startup(): - with Connection( - VM_IP, - "administrator", - connect_kwargs=CONNECT_KWARGS, - ) as c: - script_path = "E:\\base\\asr\\faster-whisper\\server" - script_path = "E:\\install\\asr\\sensevoice\\server" - bat_filepath = f"{script_path}\\start.bat" - config_filepath = "E:\\submit\\config.json" - result = c.run("") - assert result.ok - c.run( - f'cd /d {script_path} & set "EDGE_ML_ENV_HOME=E:\\install" & {bat_filepath} {config_filepath}', - warn=True, - ) - - with Connection( - VM_IP, - "administrator", - connect_kwargs=CONNECT_KWARGS, - ) as c: - model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model]) - filename = os.path.basename(model_filepath) - put_file_to_vm(c, model_filepath, "/E:/") - - result = c.run("mkdir E:\\base") - assert result.ok - result = c.run("mkdir E:\\model") - assert result.ok - result = c.run("mkdir E:\\submit") - assert result.ok - - result = c.run( - f"tar zxvf E:\\{filename} -C E:\\base --strip-components 1" - ) - assert result.ok - - result = c.run("E:\\base\\setup-win.bat E:\\install") - assert result.ok - - put_file_to_vm(c, config_path, "/E:/submit") - put_file_to_vm(c, model_path, "/E:/model") - result = c.run( - f"tar zxvf E:\\model\\{os.path.basename(model_path)} -C E:\\model" - ) - assert result.ok - threading.Thread(target=sut_startup, daemon=True).start() - time.sleep(60) - - return f"ws://{VM_IP}:{config['port']}" - - -def deploy_macos_sut(): - global VM_ID - global VM_IP - - submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "") - with open(submit_config_filepath, "r") as fp: - st_config = yaml.safe_load(fp) - assert "model" in st_config, "未配置model" - assert "model_key" in st_config, "未配置model_key" - assert "config.json" in st_config, "未配置config.json" - nfs = st_config.get("leaderboard_options", {}).get("nfs", []) - assert len(nfs) > 0, "未配置nfs" - assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内" - - model = st_config["model"] - model_key = st_config["model_key"] - model_path = "" - config = st_config["config.json"] - exist = False - for nfs_item in nfs: - if nfs_item["name"] == model_key: - exist = True - if nfs_item["source"] == "ceph_customer": - model_path = os.path.join( - "/tmp/customer", - nfs_item["srcRelativePath"], - ) - else: - model_path = os.path.join( - "/tmp/juicefs", - nfs_item["srcRelativePath"], - ) - break - if not exist: - raise RuntimeError(f"未找到nfs配置项 name={model_key}") - config_path = os.path.join(tempfile.mkdtemp(), "config.json") - model_dir = os.path.basename(model_path).split(".")[0] - - vmclient = Client() - with open(SSH_PUBLIC_KEY_FILE, "r") as fp: - sshpublickey = fp.read().rstrip() - VM_ID = vmclient.create_vm( - "amd64", - VMOS.macos12, - VM_CPU, - VM_MEM, - "leaderboard-%s-submit-%s-job-%s" - % ( - os.getenv("BENCHMARK_NAME"), - os.getenv("SUBMIT_ID"), - os.getenv("JOB_ID"), - ), - sshpublickey, - datadisks=[ - VMDataDisk( - size=50, - disk_type="ssd", - mount_path="/", - filesystem="apfs", - ) - ], - ) - atexit.register(clean_vm_atexit) - signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum)) - VM_IP = vmclient.wait_until_vm_running(VM_ID) - logger.info("vm created successfully, vm_ip: %s", VM_IP) - - with Connection( - VM_IP, - "admin", - connect_kwargs=CONNECT_KWARGS, - ) as c: - result = c.run("ls -d /Volumes/data*") - assert result.ok - volume_path = result.stdout.strip() - - config["model_path"] = f"{volume_path}/model/{model_dir}" - with open(config_path, "w") as fp: - json.dump(config, fp, ensure_ascii=False, indent=4) - - def sut_startup(): - with Connection( - VM_IP, - "admin", - connect_kwargs=CONNECT_KWARGS, - ) as c: - script_path = f"{volume_path}/install/asr/sensevoice/server" - startsh = f"{script_path}/start.sh" - config_filepath = f"{volume_path}/submit/config.json" - c.run( - f"cd {script_path} && sh {startsh} {config_filepath}", - warn=True, - ) - - with Connection( - VM_IP, - "admin", - connect_kwargs=CONNECT_KWARGS, - ) as c: - model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model]) - filename = os.path.basename(model_filepath) - put_file_to_vm(c, model_filepath, f"{volume_path}") - - result = c.run(f"mkdir {volume_path}/base") - assert result.ok - result = c.run(f"mkdir {volume_path}/model") - assert result.ok - result = c.run(f"mkdir {volume_path}/submit") - assert result.ok - - result = c.run( - f"tar zxvf {volume_path}/{filename} -C {volume_path}/base --strip-components 1" # noqa: E501 - ) - assert result.ok - - result = c.run( - f"sh {volume_path}/base/setup-mac.sh {volume_path}/install x64" - ) - assert result.ok - - put_file_to_vm(c, config_path, f"{volume_path}/submit") - put_file_to_vm(c, model_path, f"{volume_path}/model") - result = c.run( - f"tar zxvf {volume_path}/model/{os.path.basename(model_path)} -C {volume_path}/model" # noqa: E501 - ) - assert result.ok - threading.Thread(target=sut_startup, daemon=True).start() - time.sleep(60) - - return f"ws://{VM_IP}:{config['port']}" - - -def get_sut_url_vm(vm_type: str): - global VM_ID - global VM_IP - global do_deploy_chart - - do_deploy_chart = True - # 拉起SUT - - def check_job_failed(): - while True: - time.sleep(30) - if os.path.exists(SUT_SHARE_PUBLIC_FAIL): - logger.error("there is a job failed in current submit") - sys.exit(1) - - sut_url = "" - threading.Thread(target=check_job_failed, daemon=True).start() - if SHARE_SUT: - - time.sleep(10 * random.random()) - try: - open(SUT_SHARE_LOCK, "x").close() - except Exception: - do_deploy_chart = False - - start_at = time.time() - - def file_last_updated_at(file: str): - return os.stat(file).st_mtime if os.path.exists(file) else start_at - - if not do_deploy_chart: - with open(SUT_SHARE_JOB_STATUS, "w") as f: - f.write("waiting") - while ( - time.time() - file_last_updated_at(SUT_SHARE_STATUS) - <= 60 * 60 * 24 - ): - logger.info( - "Waiting sut application to be deployed by another job" - ) - time.sleep(10 + random.random()) - if os.path.exists(SUT_SHARE_STATUS): - get_status = False - for _ in range(10): - try: - with open(SUT_SHARE_STATUS, "r") as f: - status = json.load(f) - get_status = True - break - except Exception: - time.sleep(1 + random.random()) - continue - if not get_status: - raise RuntimeError( - "Failed to get status of sut application" - ) - assert ( - status.get("status") != "failed" - ), "Failed to deploy sut application, \ -please check other job logs" - if status.get("status") == "running": - VM_ID = status.get("vmid") - VM_IP = status.get("vmip") - sut_url = status.get("sut_url") - with open(SSH_PUBLIC_KEY_FILE, "w") as fp: - fp.write(status.get("pubkey")) - with open(SSH_KEY_FILE, "w") as fp: - fp.write(status.get("prikey")) - logger.info("Successfully get deployed sut application") - break - - if do_deploy_chart: - try: - fcntl.flock(fd_lock, fcntl.LOCK_EX) - with open(SUT_SHARE_JOB_STATUS, "w") as f: - f.write("waiting") - pending = True - - def update_status(): - while pending: - time.sleep(30) - if not pending: - break - with open(SUT_SHARE_STATUS, "w") as f: - json.dump({"status": "pending"}, f) - - threading.Thread(target=update_status, daemon=True).start() - if vm_type == "windows": - sut_url = deploy_windows_sut() - else: - sut_url = deploy_macos_sut() - except Exception: - open(SUT_SHARE_PUBLIC_FAIL, "w").close() - with open(SUT_SHARE_STATUS, "w") as f: - json.dump({"status": "failed"}, f) - raise - finally: - pending = False - with open(SUT_SHARE_STATUS, "w") as f: - pubkey = "" - with open(SSH_PUBLIC_KEY_FILE, "r") as fp: - pubkey = fp.read().rstrip() - prikey = "" - with open(SSH_KEY_FILE, "r") as fp: - prikey = fp.read() - json.dump( - { - "status": "running", - "vmid": VM_ID, - "vmip": VM_IP, - "pubkey": pubkey, - "sut_url": sut_url, - "prikey": prikey, - }, - f, - ) - else: - while True: - time.sleep(5 + random.random()) - try: - fcntl.flock(fd_lock, fcntl.LOCK_EX | fcntl.LOCK_NB) - break - except Exception: - logger.info("尝试抢占调用sut失败,继续等待 5s ...") - - with open(SUT_SHARE_JOB_STATUS, "w") as f: - f.write("running") - - return sut_url - - -def get_sut_url(): - if SUT_TYPE in ("windows", "macos"): - return get_sut_url_vm(SUT_TYPE) - - submit_config_filepath = os.getenv( - "SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config" - ) - CPU = os.getenv("SUT_CPU", "2") - MEMORY = os.getenv("SUT_MEMORY", "4Gi") - resource_name = os.getenv("BENCHMARK_NAME") - - # 任务信息 - # 斯拉夫语族:俄语、波兰语 - # 日耳曼语族:英语、德语、荷兰语 - # 拉丁语族(罗曼语族):西班牙语、葡萄牙语、法国语、意大利语 - # 闪米特语族:阿拉伯语、希伯来语 - - # 提交配置 & 启动被测服务 - if os.getenv("DATASET_FILEPATH", ""): - with open(submit_config_filepath, "r") as fp: - st_config = yaml.safe_load(fp) - if "values" not in st_config: - st_config["values"] = {} - st_config["values"]["resources"] = {} - st_config["values"]["resources"]["limits"] = {} - st_config["values"]["resources"]["limits"]["cpu"] = CPU - st_config["values"]["resources"]["limits"]["memory"] = MEMORY - # st_config["values"]['resources']['limits']['nvidia.com/gpu'] = '1' - # st_config["values"]['resources']['limits']['nvidia.com/gpumem'] = "1843" - # st_config["values"]['resources']['limits']['nvidia.com/gpucores'] = "8" - st_config["values"]["resources"]["requests"] = {} - st_config["values"]["resources"]["requests"]["cpu"] = CPU - st_config["values"]["resources"]["requests"]["memory"] = MEMORY - # st_config["values"]['resources']['requests']['nvidia.com/gpu'] = '1' - # st_config["values"]['resources']['requests']['nvidia.com/gpumem'] = "1843" - # st_config["values"]['resources']['requests']['nvidia.com/gpucores'] = "8" - # st_config['values']['nodeSelector'] = {} - # st_config["values"]["nodeSelector"][ - # "contest.4pd.io/accelerator" - # ] = "A10vgpu" - # st_config['values']['tolerations'] = [] - # toleration_item = {} - # toleration_item['key'] = 'hosttype' - # toleration_item['operator'] = 'Equal' - # toleration_item['value'] = 'vgpu' - # toleration_item['effect'] = 'NoSchedule' - # st_config['values']['tolerations'].append(toleration_item) - if os.getenv("RESOURCE_TYPE", "cpu") == "cpu": - values = st_config["values"] - limits = values.get("resources", {}).get("limits", {}) - requests = values.get("resources", {}).get("requests", {}) - if ( - "nvidia.com/gpu" in limits - or "nvidia.com/gpumem" in limits - or "nvidia.com/gpucores" in limits - or "nvidia.com/gpu" in requests - or "nvidia.com/gpumem" in requests - or "nvidia.com/gpucores" in requests - ): - raise Exception("禁止使用GPU!") - else: - vgpu_num = int(os.getenv("SUT_VGPU", "3")) - st_config["values"]["resources"]["limits"]["nvidia.com/gpu"] = ( - str(vgpu_num) - ) - st_config["values"]["resources"]["limits"][ - "nvidia.com/gpumem" - ] = str(1843 * vgpu_num) - st_config["values"]["resources"]["limits"][ - "nvidia.com/gpucores" - ] = str(8 * vgpu_num) - st_config["values"]["resources"]["requests"][ - "nvidia.com/gpu" - ] = str(vgpu_num) - st_config["values"]["resources"]["requests"][ - "nvidia.com/gpumem" - ] = str(1843 * vgpu_num) - st_config["values"]["resources"]["requests"][ - "nvidia.com/gpucores" - ] = str(8 * vgpu_num) - st_config["values"]["nodeSelector"] = {} - st_config["values"]["nodeSelector"][ - "contest.4pd.io/accelerator" - ] = "A10vgpu" - st_config["values"]["tolerations"] = [] - toleration_item = {} - toleration_item["key"] = "hosttype" - toleration_item["operator"] = "Equal" - toleration_item["value"] = "vgpu" - toleration_item["effect"] = "NoSchedule" - st_config["values"]["tolerations"].append(toleration_item) - if "docker_images" in st_config: - sut_url = "ws://172.26.1.75:9827" - os.environ["test"] = "1" - elif "docker_image" in st_config: - sut_url = register_sut(st_config, resource_name) - elif UNIT_TEST: - sut_url = "ws://172.27.231.36:80" - else: - logger.error("config 配置错误,没有 docker_image") - os._exit(1) - return sut_url - else: - os.environ["test"] = "1" - sut_url = "ws://172.27.231.36:80" - sut_url = "ws://172.26.1.75:9827" - return sut_url - - -def load_merge_dataset(dataset_filepath: str) -> dict: - local_dataset_path = "./dataset" - os.makedirs(local_dataset_path, exist_ok=True) - with zipfile.ZipFile(dataset_filepath) as zf: - zf.extractall(local_dataset_path) - - config = {} - sub_datasets = os.listdir(local_dataset_path) - for sub_dataset in sub_datasets: - if sub_dataset.startswith("asr."): - lang = sub_dataset[4:] - lang_path = os.path.join(local_dataset_path, lang) - os.makedirs(lang_path, exist_ok=True) - with zipfile.ZipFile( - os.path.join(local_dataset_path, sub_dataset) - ) as zf: - zf.extractall(lang_path) - lang_config_path = os.path.join(lang_path, "data.yaml") - with open(lang_config_path, "r") as fp: - lang_config = yaml.safe_load(fp) - audio_lengths = {} - for query_item in lang_config.get("query_data", []): - audio_path = os.path.join( - lang_path, - query_item["file"], - ) - query_item["file"] = audio_path - audio_lengths[query_item["file"]] = os.path.getsize( - audio_path, - ) - lang_config["query_data"] = sorted( - lang_config.get("query_data", []), - key=lambda x: audio_lengths[x["file"]], - reverse=True, - ) - - idx = 0 - length = 0.0 - for query_item in lang_config["query_data"]: - audio_length = audio_lengths[query_item["file"]] - length += audio_length / 32000 - idx += 1 - # 每个语言限制半个小时长度 - if length >= 30 * 60: - break - - lang_config["query_data"] = lang_config["query_data"][:idx] - config[lang] = lang_config - - config["query_data"] = [] - for lang, lang_config in config.items(): - if lang == "query_data": - continue - for query_item in lang_config["query_data"]: - config["query_data"].append( - { - **query_item, - "lang": lang, - } - ) - random.Random(0).shuffle(config["query_data"]) - - return config - - -def postprocess_failed(): - open(SUT_SHARE_PUBLIC_FAIL, "w").close() - - -def main(): - dataset_filepath = os.getenv( - "DATASET_FILEPATH", - "/Users/4paradigm/Projects/dataset/asr/de.zip", - # "./tests/resources/en.zip", - ) - result_filepath = os.getenv("RESULT_FILEPATH", "./out/result") - bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase") - detail_cases_filepath = os.getenv( - "DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl" - ) - thread_num = int(os.getenv("THREAD_NUM", "1")) - - # 数据集处理 - config = {} - if os.getenv("MERGE_DATASET", "1"): - config = load_merge_dataset(dataset_filepath) - dataset_query = config["query_data"] - else: - local_dataset_path = "./dataset" - os.makedirs(local_dataset_path, exist_ok=True) - with zipfile.ZipFile(dataset_filepath) as zf: - zf.extractall(local_dataset_path) - config_path = os.path.join(local_dataset_path, "data.yaml") - with open(config_path, "r") as fp: - dataset_config = yaml.safe_load(fp) - # 读取所有的音频,进而获得音频的总长度,最后按照音频长度对 query_data 进行降序排序 - lang = os.getenv("lang") - if lang is None: - lang = dataset_config.get("global", {}).get("lang", "en") - audio_lengths = [] - for query_item in dataset_config.get("query_data", []): - query_item["lang"] = lang - audio_path = os.path.join(local_dataset_path, query_item["file"]) - query_item["file"] = audio_path - audio_lengths.append(os.path.getsize(audio_path) / 1024 / 1024) - dataset_config["query_data"] = sorted( - dataset_config.get("query_data", []), - key=lambda x: audio_lengths[dataset_config["query_data"].index(x)], - reverse=True, - ) - # 数据集信息 - # dataset_global_config = dataset_config.get("global", {}) - dataset_query = dataset_config.get("query_data", {}) - config[lang] = dataset_config - - # sut url - sut_url = get_sut_url() - - try: - # 开始测试 - logger.info("开始执行") - evaluator = BaseEvaluator() - future_list = [] - with ThreadPoolExecutor(max_workers=thread_num) as executor: - for idx, query_item in enumerate(dataset_query): - context = ASRContext( - **config[query_item["lang"]].get("global", {}), - ) - context.lang = query_item["lang"] - context.file_path = query_item["file"] - context.append_labels(query_item["voice"]) - future = executor.submit( - ClientAsync(sut_url, context, idx).action - ) - future_list.append(future) - for future in concurrent.futures.as_completed(future_list): - context = future.result() - evaluator.evaluate(context) - detail_case = evaluator.gen_detail_case() - with open(detail_cases_filepath, "a") as fp: - fp.write( - json.dumps( - detail_case.to_dict(), - ensure_ascii=False, - ) - + "\n", - ) - del context - gc.collect() - - evaluator.post_evaluate() - output_result = evaluator.gen_result() - logger.info("执行完成") - - with open(result_filepath, "w") as fp: - json.dump(output_result, fp, indent=2, ensure_ascii=False) - with open(bad_cases_filepath, "w") as fp: - fp.write("当前榜单不存在 Bad Case\n") - - if SHARE_SUT: - with open(SUT_SHARE_JOB_STATUS, "w") as f: - f.write("success") - - fcntl.flock(fd_lock, fcntl.LOCK_UN) - fd_lock.close() - while SHARE_SUT and do_deploy_chart: - time.sleep(30) - success_num = 0 - for job_status_file in glob.glob(dirname + "/job_status.*"): - with open(job_status_file, "r") as f: - job_status = f.read() - success_num += job_status == "success" - if success_num == int(DATASET_NUM): - break - logger.info("Waiting for all jobs to complete") - except Exception: - if SHARE_SUT: - postprocess_failed() - raise - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/schemas/__init__.py b/schemas/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/schemas/context.py b/schemas/context.py deleted file mode 100644 index adff8d6..0000000 --- a/schemas/context.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -from copy import deepcopy -from typing import Dict, List, Optional - -from pydantic import BaseModel, Field - -from schemas.stream import StreamDataModel - - -class LabelContext(BaseModel): - start: float - end: float - answer: str - - -class PredContext(BaseModel): - recognition_results: StreamDataModel - recv_time: Optional[float] = Field(None) - send_time: Optional[float] = Field(None) - - -class ASRContext: - def __init__(self, **kwargs): - self.bits = kwargs.get("bits", 16) - self.channel = kwargs.get("channel", 1) - self.sample_rate = kwargs.get("sample_rate", 16000) - self.audio_format = kwargs.get("format", "wav") - self.enable_words = kwargs.get("enable_words", True) - self.char_contains_rate = kwargs.get("char_contains_rate", 0.8) - self.lang = os.getenv("lang") - if self.lang is None: - self.lang = kwargs.get("lang", "en") - self.stream = kwargs.get("stream", True) - - self.wait_time = float(os.getenv("wait_time", 0.1)) - self.chunk_size = self.sample_rate * self.bits / 8 * self.wait_time - if int(os.getenv('chunk_size_set', 0)): - self.chunk_size = int(os.getenv('chunk_size_set', 0)) - - self.audio_length = 0 - self.file_path = "" - - self.labels: List[LabelContext] = kwargs.get("labels", []) - self.preds: List[PredContext] = kwargs.get("preds", []) - - self.label_sentences: List[str] = [] - self.pred_sentences: List[str] = [] - - self.send_time_start_end = [] - self.recv_time_start_end = [] - - self.fail = False - self.fail_char_contains_rate_num = 0 - - self.punctuation_num = 0 - self.pred_punctuation_num = 0 - - def append_labels(self, voices: List[Dict]): - for voice_data in voices: - label_context = LabelContext(**voice_data) - self.labels.append(label_context) - - def append_preds( - self, - predict_data: List[StreamDataModel], - send_time: List[float], - recv_time: List[float], - ): - self.send_time_start_end = [send_time[0], send_time[-1]] if len(send_time) > 0 else [] - self.recv_time_start_end = [recv_time[0], recv_time[-1]] if len(recv_time) > 0 else [] - for pred_item, send_time_item, recv_time_item in zip(predict_data, send_time, recv_time): - pred_item = deepcopy(pred_item) - pred_context = PredContext(recognition_results=pred_item.model_dump()) - pred_context.send_time = send_time_item - pred_context.recv_time = recv_time_item - self.preds.append(pred_context) - - def to_dict(self): - return { - "bits": self.bits, - "channel": self.channel, - "sample_rate": self.sample_rate, - "audio_format": self.audio_format, - "enable_words": self.enable_words, - "stream": self.stream, - "wait_time": self.wait_time, - "chunk_size": self.chunk_size, - "labels": [item.model_dump_json() for item in self.labels], - "preds": [item.model_dump_json() for item in self.preds], - } diff --git a/schemas/dataset.py b/schemas/dataset.py deleted file mode 100644 index 2c940c7..0000000 --- a/schemas/dataset.py +++ /dev/null @@ -1,18 +0,0 @@ -from typing import List - -from pydantic import BaseModel, Field - - -class QueryDataSentence(BaseModel): - answer: str = Field(description="文本label") - start: float = Field(description="句子开始时间") - end: float = Field(description="句子结束时间") - - -class QueryData(BaseModel): - lang: str = Field(description="语言") - file: str = Field(description="音频文件位置") - duration: float = Field(description="音频长度") - voice: List[QueryDataSentence] = Field( - description="音频文件的文本label内容" - ) diff --git a/schemas/stream.py b/schemas/stream.py deleted file mode 100644 index 44f8111..0000000 --- a/schemas/stream.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import List - -from pydantic import BaseModel, ValidationError, field_validator -from pydantic import model_validator - - -class StreamWordsModel(BaseModel): - text: str - start_time: float - end_time: float - - @model_validator(mode="after") - def check_result(self): - if self.end_time < self.start_time: - raise ValidationError("end-time 小于 start-time, error") - return self - - -class StreamDataModel(BaseModel): - text: str - language: str - final_result: bool - para_seq: int - start_time: float - end_time: float - words: List[StreamWordsModel] - - @model_validator(mode="after") - def check_result(self): - if self.end_time < self.start_time: - raise ValidationError("end-time 小于 start-time, error") - return self - - -class StreamResultModel(BaseModel): - asr_results: StreamDataModel - - @field_validator('asr_results', mode="after") - def convert_to_seconds(cls, v: StreamDataModel, values): - # 在这里处理除以1000的逻辑 - v.end_time = v.end_time / 1000 - v.start_time = v.start_time / 1000 - for word in v.words: - word.start_time /= 1000 - word.end_time /= 1000 - return v - - class Config: - validate_assignment = True - - -class NonStreamDataModel(BaseModel): - text: str - para_seq: int - start_time: float - end_time: float - - @model_validator(mode="after") - def check_result(self): - if self.end_time < self.start_time: - raise ValidationError("end-time 小于 start-time, error") - return self - - -class NonStreamResultModel(BaseModel): - contents: List[NonStreamDataModel] diff --git a/scripts/check_dataset_time.py b/scripts/check_dataset_time.py deleted file mode 100644 index ef07b9f..0000000 --- a/scripts/check_dataset_time.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import sys -from collections import defaultdict - -import yaml - - -def main(dataset_dir): - dirs = os.listdir(dataset_dir) - dirs = list( - filter(lambda x: os.path.isdir(os.path.join(dataset_dir, x)), dirs) - ) - - problem_dirs = set() - problem_count = defaultdict(int) - for dir in dirs: - with open(os.path.join(dataset_dir, dir, "data.yaml"), "r") as f: - data = yaml.full_load(f) - for query_i, query in enumerate(data["query_data"]): - voices = sorted(query["voice"], key=lambda x: x["start"]) - if voices != query["voice"]: - print("-----", dir) - if voices[0]["start"] > voices[0]["end"]: - print( - "err1: %s 第%s个query的第%d个voice的start大于end: %s" - % (dir, query_i, 0, voices[0]["answer"]) - ) - problem_dirs.add(dir) - for voice_i in range(1, len(voices)): - voice = voices[voice_i] - if voice["start"] > voice["end"]: - print( - "err1: %s 第%s个query的第%d个voice的start大于end: %s" - % (dir, query_i, voice_i, voice["answer"]) - ) - problem_dirs.add(dir) - if voice["start"] < voices[voice_i - 1]["end"]: - print( - "err2: %s 第%s个query的第%d个voice的start小于前一个voice的end: %s" - % (dir, query_i, voice_i, voice["answer"]) - ) - problem_dirs.add(dir) - problem_count[dir] += 1 - print(len(dirs)) - print(problem_dirs) - print(problem_count) - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("指定 测试数据集文件夹") - sys.exit(1) - main(sys.argv[1]) diff --git a/scripts/convert_callback_dataset.py b/scripts/convert_callback_dataset.py deleted file mode 100644 index 1f52a33..0000000 --- a/scripts/convert_callback_dataset.py +++ /dev/null @@ -1,108 +0,0 @@ -import json -import os -import shutil -import sys -import zipfile - -import yaml - -""" -target -{ - "global": { - "lang": "" - }, - "query_data": [ - "file": "", - "duration": 2.0, - "voice": [ - { - "answer": "", - "start": 0.0, - "end": 1.0 - } - ] - ] -} -""" - - -def situation_a(meta, dataset_folder, output_folder): - """ - { - "combined": { - "en": [ - { - "wav": "*.wav", - "transcriptions": [ - { - "text": "", - "start": 0.0, - "end": 1.0 - } - ], - "duration": 2.0 - } - ] - } - } - """ - meta = meta["combined"] - - for lang, arr in meta.items(): - print("processing", lang) - assert len(lang) == 2 - lang_folder = os.path.join(output_folder, lang) - os.makedirs(lang_folder, exist_ok=True) - data = {"global": {"lang": lang}, "query_data": []} - query_data = data["query_data"] - for item in arr: - os.makedirs( - os.path.join(lang_folder, os.path.dirname(item["wav"])), - exist_ok=True, - ) - mp3_file = item["wav"][:-4] + ".mp3" - shutil.copyfile( - os.path.join(dataset_folder, mp3_file), - os.path.join(lang_folder, mp3_file), - ) - query_data_item = { - "file": mp3_file, - "duration": float(item["duration"]), - "voice": [], - } - query_data.append(query_data_item) - voice = query_data_item["voice"] - for v in item["transcriptions"]: - voice.append( - { - "answer": v["text"], - "start": float(v["start"]), - "end": float(v["end"]), - } - ) - with open(os.path.join(lang_folder, "data.yaml"), "w") as f: - yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8") - with zipfile.ZipFile( - os.path.join(output_folder, lang + ".zip"), "w" - ) as ziper: - dirname = lang_folder - for path, _, files in os.walk(dirname): - for file in files: - ziper.write( - os.path.join(path, file), - os.path.join(path[len(dirname) :], file), - zipfile.ZIP_DEFLATED, - ) - - -if __name__ == "__main__": - if len(sys.argv) < 3: - print("指定 数据集文件夹路径 输出路径") - sys.exit(1) - dataset_folder = sys.argv[1] - output_folder = sys.argv[2] - - with open(os.path.join(dataset_folder, "meta.json")) as f: - meta = json.load(f) - situation_a(meta, dataset_folder, output_folder) diff --git a/scripts/debug_detailcase.py b/scripts/debug_detailcase.py deleted file mode 100644 index 28b43dc..0000000 --- a/scripts/debug_detailcase.py +++ /dev/null @@ -1,56 +0,0 @@ -import json -import sys - -from schemas.dataset import QueryData -from schemas.stream import StreamDataModel -from utils.evaluator_plus import evaluate_editops - - -def main(detailcase_file: str): - with open(detailcase_file) as f: - d = json.load(f)[0] - preds = d["preds"] - preds = list(map(lambda x: StreamDataModel(**x), preds)) - preds = list(filter(lambda x: x.final_result, preds)) - label = d["label"] - label = QueryData(**label) - print(evaluate_editops(label, preds)) - - -def evaluate_from_record(detailcase_file: str, record_path: str): - with open(detailcase_file) as f: - d = json.load(f)[0] - label = d["label"] - label = QueryData(**label) - with open(record_path) as f: - record = json.load(f) - tokens_pred = record["tokens_pred"] - tokens_label = record["tokens_label"] - recognition_results = record["recognition_results"] - recognition_results = list( - map(lambda x: StreamDataModel(**x), recognition_results) - ) - a, b = [], [] - for i, rr in enumerate(recognition_results): - if rr.final_result: - a.append(tokens_pred[i]) - b.append(rr) - tokens_pred = a - recognition_results = b - - print( - evaluate_editops( - label, - recognition_results, - tokens_pred, - tokens_label, - ) - ) - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("请指定 detailcase 文件路径") - sys.exit(1) - main(sys.argv[1]) - # evaluate_from_record(sys.argv[1], sys.argv[2]) diff --git a/ssh-keygen b/ssh-keygen deleted file mode 100755 index 3b76d1b..0000000 Binary files a/ssh-keygen and /dev/null differ diff --git a/starting_kit/Dockerfile b/starting_kit/Dockerfile deleted file mode 100644 index a1f0693..0000000 --- a/starting_kit/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0 - -WORKDIR /workspace - -ADD ./requirements.txt /workspace -RUN pip install -r ./requirements.txt -i https://nexus.4pd.io/repository/pypi-all/simple --trusted-host nexus.4pd.io --extra-index-url https://mirrors.aliyun.com/pypi/simple/ \ - && pip cache purge - -ADD . /workspace - -CMD ["python", "main.py"] diff --git a/starting_kit/main.py b/starting_kit/main.py deleted file mode 100644 index 9662fd2..0000000 --- a/starting_kit/main.py +++ /dev/null @@ -1,313 +0,0 @@ -import logging -import os -import threading -import time -from typing import Optional - -import flask -import requests -from werkzeug.datastructures import FileStorage - -app = flask.Flask(__name__) -heartbeat_active = False - -log = logging.getLogger(__name__) - -log.propagate = False - -level = logging.INFO - -log.setLevel(level) - -formatter = logging.Formatter( - "[%(asctime)s] %(levelname)s : %(pathname)s:%(lineno)d - %(message)s", - "%Y-%m-%d %H:%M:%S", -) - -streamHandler = logging.StreamHandler() -streamHandler.setLevel(level) -streamHandler.setFormatter(formatter) -log.addHandler(streamHandler) - - -def heartbeat(url): - global heartbeat_active - if heartbeat_active: - return - heartbeat_active = True - while True: - try: - requests.post(url, json={"status": "RUNNING"}) - except Exception: - pass - time.sleep(10) - - -def asr( - audio_file: FileStorage, - language: Optional[str], - progressCallbackUrl: str, - taskId: str, -): - """TODO: 读取audio_file, 调用语音识别服务, 实时返回识别结果""" - - # ignore BEGIN - # 此处为榜单本地测试使用 - if os.getenv("LOCAL_TEST"): - return local_test(progressCallbackUrl, taskId) - # ignore END - - language = "de" - # 某一次识别返回 - requests.post( - progressCallbackUrl, - json={ - "taskId": taskId, - "status": "RUNNING", - "recognition_results": { # 传增量结果, status如果是FINISHED, 或者ERROR, 这个字段请不要传值 - "text": "最先启动的还是", - "final_result": True, - "para_seq": 0, - "language": language, - "start_time": 6300, - "end_time": 6421, - "words": [ - { - "text": "最", - "start_time": 6300, - "end_time": 6321, - }, - { - "text": "先", - "start_time": 6321, - "end_time": 6345, - }, - { - "text": "启", - "start_time": 6345, - "end_time": 6350, - }, - { - "text": "动", - "start_time": 6350, - "end_time": 6370, - }, - { - "text": "的", - "start_time": 6370, - "end_time": 6386, - }, - { - "text": "还", - "start_time": 6386, - "end_time": 6421, - }, - { - "text": "是", - "start_time": 6421, - "end_time": 6435, - }, - ], - }, - }, - ) - # ... 识别结果返回完毕 - - # 识别结束 - requests.post( - progressCallbackUrl, - json={ - "taskId": taskId, - "status": "FINISHED", - }, - ) - - -@app.post("/predict") -def predict(): - body = flask.request.form - language = body.get("language") - if language is None: - "自行判断语种" - taskId = body["taskId"] - progressCallbackUrl = body["progressCallbackUrl"] - heartbeatUrl = body["heartbeatUrl"] - - threading.Thread( - target=heartbeat, args=(heartbeatUrl,), daemon=True - ).start() - - audio_file = flask.request.files["file"] - # audio_file.stream # 读取文件流 - # audio_file.save("audio.mp3") # 保存文件 - threading.Thread( - target=asr, - args=(audio_file, language, progressCallbackUrl, taskId), - daemon=True, - ).start() - return flask.jsonify({"status": "OK"}) - - -# ignore BEGIN -def local_test(progressCallbackUrl: str, taskId: str): - """忽略此方法, 此方法为榜单本地调试使用""" - import random - import re - - import yaml - - def callback(content): - try: - if content is None: - requests.post( - progressCallbackUrl, - json={"taskId": taskId, "status": "FINISHED"}, - ) - else: - requests.post( - progressCallbackUrl, - json={ - "taskId": taskId, - "status": "RUNNING", - "recognition_results": content, - }, - ) - except Exception: - pass - - with open( - os.getenv("LOCAL_TEST_DATA_PATH", "../dataset/out/data.yaml") - ) as f: - data = yaml.full_load(f) - - voices = data["query_data"][0]["voice"] - - # 首次发送 - first_send_time = random.randint(3, 5) - send_interval = random.random() * 0 - log.info("首次发送%ss 发送间隔%ss" % (first_send_time, send_interval)) - time.sleep(first_send_time) - - # 将句子拼接到一起 - if random.random() < 0.3: - log.info("将部分句子合并成单句 每次合并的句子不超过3句") - rand_idx = 0 - rand_sep = [0, len(voices) - 1] - while rand_sep[rand_idx] + 1 <= rand_sep[rand_idx + 1] - 1: - rand_cursep = random.randint( - rand_sep[rand_idx] + 1, - min(rand_sep[rand_idx + 1] - 1, rand_sep[rand_idx] + 1 + 3), - ) - rand_sep.insert(rand_idx + 1, rand_cursep) - rand_idx += 1 - merged_voices = [] - for i, cur_sep in enumerate(rand_sep[:-1]): - voice = voices[cur_sep] - for j in range(cur_sep + 1, rand_sep[i + 1]): - voice["answer"] += voices[j]["answer"] - voice["end"] = voices[j]["end"] - merged_voices.append(voice) - merged_voices.append(voices[rand_sep[-1]]) - voices = merged_voices - - def split_and_keep(text, delimiters): - # 构建正则表达式模式,匹配文本或分隔符 - pattern = "|".join(re.escape(delimiter) for delimiter in delimiters) - pattern = f"(?:[^{pattern}]+|[{pattern}])" - return re.findall(pattern, text) - - puncs = [",", ".", "?", "!", ";", ":"] - - para_seq = 0 - for voice in voices: - answer: str = voice["answer"] - start_time: float = voice["start"] - end_time: float = voice["end"] - words = split_and_keep(answer, puncs) - temp_words = [] - for i, word in enumerate(words): - if i > 0 and i < len(words) - 1 and random.random() < 0.15: - log.info("随机删除word") - continue - temp_words.extend(word.split(" ")) - if len(temp_words) == 0: - temp_words = words[0].split(" ") - words = temp_words - answer = " ".join(words) - words = list(map(lambda x: x.strip(), words)) - words = list(filter(lambda x: len(x) > 0, words)) - - # 将时间均匀分配到每个字上 - words_withtime = [] - word_unittime = (end_time - start_time) / len(words) - for i, word in enumerate(words): - word_start = start_time + word_unittime * i - word_end = word_start + word_unittime - words_withtime.append( - { - "text": word, - "start_time": word_start * 1000, - "end_time": word_end * 1000, - } - ) - - # 将句子首尾的标点符号时间扩展到字上 标点符号时间为瞬间 - punc_at = 0 - while punc_at < len(words) and words[punc_at] in puncs: - punc_at += 1 - if punc_at < len(words): - words_withtime[punc_at]["start_time"] = words_withtime[0][ - "start_time" - ] - for i in range(0, punc_at): - words_withtime[i]["start_time"] = words_withtime[0]["start_time"] - words_withtime[i]["end_time"] = words_withtime[0]["start_time"] - punc_at = len(words) - 1 - while punc_at >= 0 and words[punc_at] in puncs: - punc_at -= 1 - if punc_at >= 0: - words_withtime[punc_at]["end_time"] = words_withtime[-1]["end_time"] - for i in range(punc_at + 1, len(words)): - words_withtime[i]["start_time"] = ( - words_withtime[-1]["end_time"] + 0.1 - ) - words_withtime[i]["end_time"] = words_withtime[-1]["end_time"] + 0.1 - - if random.random() < 0.4 and len(words_withtime) > 1: - log.info("发送一次final_result=False") - rand_idx = random.randint(1, len(words_withtime) - 1) - recognition_result = { - "text": " ".join( - map(lambda x: x["text"], words_withtime[:rand_idx]) - ), - "final_result": False, - "para_seq": para_seq, - "language": "de", - "start_time": start_time * 1000, - "end_time": end_time * 1000, - "words": words_withtime[:rand_idx], - } - callback(recognition_result) - - recognition_result = { - "text": answer, - "final_result": True, - "para_seq": para_seq, - "language": "de", - "start_time": start_time * 1000, - "end_time": end_time * 1000, - "words": words_withtime, - } - callback(recognition_result) - para_seq += 1 - log.info("send %s" % para_seq) - - time.sleep(send_interval) - - callback(None) - - -# ignore END - -if __name__ == "__main__": - app.run(host="0.0.0.0", port=80) diff --git a/starting_kit/requirements.txt b/starting_kit/requirements.txt deleted file mode 100644 index 09a313b..0000000 --- a/starting_kit/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -flask -requests -pyyaml diff --git a/tests/test_callback_editops.py b/tests/test_callback_editops.py deleted file mode 100644 index 90ddefd..0000000 --- a/tests/test_callback_editops.py +++ /dev/null @@ -1,16 +0,0 @@ -import json - -from schemas.dataset import QueryData -from schemas.stream import StreamDataModel -from utils.evaluator_plus import evaluate_editops - -with open("out/detail_cases.json") as f: - detail_cases = json.load(f) - -detail_case = detail_cases[0] -preds = [] -for pred in detail_case["preds"]: - preds.append(StreamDataModel.model_validate(pred)) -label = QueryData.model_validate(detail_case["label"]) - -print(evaluate_editops(label, preds)) diff --git a/tests/test_cer.py b/tests/test_cer.py deleted file mode 100644 index 6c5a66f..0000000 --- a/tests/test_cer.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -f(a, b) 计算 a -> b 的编辑距离,使用的方法是之前asr榜单的方法 -g(a, b) 计算 a -> b 的编辑距离,使用的是原始的编辑距离计算方法 -test() 是对拍程序 -""" - -import random -import string -from copy import deepcopy -from typing import List, Tuple - -import Levenshtein - - -def mapping(gt: str, dt: str): - return [i for i in gt], [i for i in dt] - - -def token_mapping( - tokens_gt: List[str], tokens_dt: List[str] -) -> Tuple[List[str], List[str]]: - arr1 = deepcopy(tokens_gt) - arr2 = deepcopy(tokens_dt) - operations = Levenshtein.editops(arr1, arr2) - for op in operations[::-1]: - if op[0] == "insert": - arr1.insert(op[1], None) - elif op[0] == "delete": - arr2.insert(op[2], None) - return arr1, arr2 - - -def cer(tokens_gt_mapping: List[str], tokens_dt_mapping: List[str]): - """输入的是经过编辑距离映射后的两个 token 序列,返回 1-cer, token-cnt""" - insert = sum(1 for item in tokens_gt_mapping if item is None) - delete = sum(1 for item in tokens_dt_mapping if item is None) - equal = sum( - 1 - for token_gt, token_dt in zip(tokens_gt_mapping, tokens_dt_mapping) - if token_gt == token_dt - ) - replace = len(tokens_gt_mapping) - insert - equal # - delete - return replace, delete, insert - - -def f(a, b): - return cer(*token_mapping(*mapping(a, b))) - - -def raw(tokens_gt, tokens_dt): - arr1 = deepcopy(tokens_gt) - arr2 = deepcopy(tokens_dt) - operations = Levenshtein.editops(arr1, arr2) - insert = 0 - delete = 0 - replace = 0 - for op in operations: - if op[0] == "insert": - insert += 1 - if op[0] == "delete": - delete += 1 - if op[0] == "replace": - replace += 1 - return replace, delete, insert - - -def g(a, b): - return raw(*mapping(a, b)) - - -def check(a, b): - ff = f(a, b) - gg = g(a, b) - if ff != gg: - print(ff, gg) - return ff == gg - - -def random_string(length): - letters = string.ascii_lowercase - return "".join(random.choice(letters) for i in range(length)) - - -def test(): - for _ in range(10000): - a = random_string(30) - b = random_string(30) - if not check(a, b): - print(a, b) - break - - -test()