update code

This commit is contained in:
zhousha
2025-08-22 18:00:46 +08:00
parent a575a38552
commit 1b78ebefdd
44 changed files with 0 additions and 2855 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -1,49 +0,0 @@
FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0
MAINTAINER shiguangchuan@4paradigm.com
WORKDIR /workspace
COPY ssh-keygen /bin
RUN wget -q ftp://ftp.4pd.io/pub/pico/temp/pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl && pip install pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl && rm -f pynini-2.1.6-cp38-cp38-manylinux_2_31_x86_64.whl
ADD ./requirements.txt /workspace
RUN pip install -r ./requirements.txt -i https://nexus.4pd.io/repository/pypi-all/simple --trusted-host nexus.4pd.io --extra-index-url https://mirrors.aliyun.com/pypi/simple/ \
&& pip cache purge \
&& ssh-keygen -f /workspace/ssh-key-ecdsa -t ecdsa -b 521 -q -N ""
ADD . /workspace
EXPOSE 80
CMD ["python3", "run_callback.py"]
###########################
## Dockerfile更新后
#FROM harbor.4pd.io/lab-platform/inf/python:3.9
#WORKDIR /app
## 安装依赖
##RUN pip install torch librosa flask
##RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
## pip cache purge && \
## pip --default-timeout=1000 install torch librosa flask
## 删除原来的 COPY pytorch_model.bin /app/
#COPY inference.py /app/
# 只需要复制启动脚本
#EXPOSE 80
#CMD ["python", "inference.py"]
####################
##############################更新0731#################################

BIN
helm-chart/.DS_Store vendored

Binary file not shown.

View File

@@ -1,77 +0,0 @@
## judgeflow chart 的要求
### values.yaml 文件必须包含如下字段,并且模板中必须引用 values.yaml 中的如下字段
```
podLabels
env
volumeMounts
volumes
affinity
```
### values.yaml 文件必须在 volumeMounts 中声明如下卷
```
workspace
submit
datafile
```
## 被测服务sut chart 的要求
### values.yaml 文件必须包含如下字段,并且资源模板中必须引用 values.yaml 中的如下字段
```
podLabels
affinity
```
针对 podLabels 字段,values.yaml 中配置格式如下:
```
podLabels: {}
```
下面给出示例
podLabels
values.yaml
templates/deployment.yaml
```
metadata:
labels:
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
```
affinity
values.yaml
```
affinity: {}
```
templates/deployment.yaml
```
spec:
template:
spec:
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
```
### 如果需要在 sut 中使用共享存储,则 sut chart 的 values.yaml 也必须包含如下字段,且模板中必须引用 values.yaml 中的如下字段
```
volumeMounts
volumes
```

View File

@@ -1,23 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View File

@@ -1,24 +0,0 @@
apiVersion: v2
name: ${chartName}
description: Leaderboard judgeflow helm chart for demo
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: ${version}
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "${appVersion}"

View File

@@ -1,62 +0,0 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "judgeflow.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "judgeflow.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "judgeflow.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "judgeflow.labels" -}}
helm.sh/chart: {{ include "judgeflow.chart" . }}
{{ include "judgeflow.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "judgeflow.selectorLabels" -}}
app.kubernetes.io/name: {{ include "judgeflow.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "judgeflow.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "judgeflow.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

View File

@@ -1,32 +0,0 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "judgeflow.fullname" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "judgeflow.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}

View File

@@ -1,61 +0,0 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "judgeflow.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,63 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "judgeflow.fullname" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
template:
metadata:
labels:
{{- include "judgeflow.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.priorityclassname }}
priorityClassName: "{{ . }}"
{{- end }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- with .Values.env }}
env:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- if and (hasKey .Values "service") (hasKey .Values.service "ports") }}
ports:
{{- range .Values.service.ports }}
- name: {{ .name }}
containerPort: {{ .port }}
{{- end }}
{{- end }}
{{- if hasKey .Values "command" }}
command: {{ .Values.command }}
{{- end }}
volumeMounts:
{{- toYaml .Values.volumeMounts | nindent 12 }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
restartPolicy: Never
{{- with .Values.volumes }}
volumes:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
backoffLimit: 0

View File

@@ -1,10 +0,0 @@
{{- if .Values.priorityclassname }}
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: "{{ .Values.priorityclassname }}"
value: {{ .Values.priorityclassvalue }}
globalDefault: false
preemptionPolicy: "Never"
description: "This is a priority class."
{{- end }}

View File

@@ -1,22 +0,0 @@
{{- if and (hasKey .Values "service") (hasKey .Values.service "type") }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "judgeflow.fullname" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
{{- range .Values.service.ports }}
- port: {{ .port }}
targetPort: {{ .port }}
protocol: TCP
name: {{ .name }}
{{- end }}
selector:
{{- include "judgeflow.selectorLabels" . | nindent 4 }}
{{- end }}

View File

@@ -1,13 +0,0 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "judgeflow.serviceAccountName" . }}
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}

View File

@@ -1,15 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
name: {{ include "judgeflow.fullname" . }}-test-connection
labels:
{{- include "judgeflow.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "judgeflow.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never

View File

@@ -1,124 +0,0 @@
# Default values for job_demo.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: "${imageRepo}"
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: "${imageTag}"
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels:
contest.4pd.io/leaderboard-resource-type: judge_flow
contest.4pd.io/leaderboard-job-id: "0"
contest.4pd.io/leaderboard-submit-id: "0"
podSecurityContext: {}
# fsGroup: 2000
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
ports:
- name: http
port: 80
ingress:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
resources:
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
limits:
cpu: 3000m
memory: 16Gi
requests:
cpu: 3000m
memory: 16Gi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
nodeSelector:
juicefs: "on"
contest.4pd.io/cpu: INTEL-8358
tolerations: []
affinity: {}
env:
- name: TZ
value: Asia/Shanghai
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
#command: '["python","run.py"]'
volumeMounts:
- name: workspace
mountPath: /tmp/workspace
- name: datafile
mountPath: /tmp/datafile
- name: submit
mountPath: /tmp/submit_config
- name: juicefs-pv
mountPath: /tmp/juicefs
- name: customer
mountPath: /tmp/customer
- name: submit-private
mountPath: /tmp/submit_private
volumes:
- name: juicefs-pv
persistentVolumeClaim:
claimName: juicefs-pvc
priorityclassname: ''
priorityclassvalue: '0'

Binary file not shown.

View File

@@ -1,23 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View File

@@ -1,24 +0,0 @@
apiVersion: v2
name: sut
description: A Helm chart for Kubernetes
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.1.0"

View File

@@ -1,62 +0,0 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "sut.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "sut.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "sut.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "sut.labels" -}}
helm.sh/chart: {{ include "sut.chart" . }}
{{ include "sut.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "sut.selectorLabels" -}}
app.kubernetes.io/name: {{ include "sut.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "sut.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "sut.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

View File

@@ -1,94 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "sut.fullname" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "sut.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "sut.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "sut.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
{{- with .Values.priorityclassname }}
priorityClassName: "{{ . }}"
{{- end }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- with .Values.env }}
env:
{{- toYaml . | nindent 12 }}
{{- end }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
protocol: TCP
{{- with .Values.command }}
command:
{{- toYaml . | nindent 12 }}
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- with .Values.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.livenessProbe }}
livenessProbe:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.readinessProbe }}
readinessProbe:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.startupProbe }}
startupProbe:
{{- toYaml . | nindent 12 }}
{{- end }}
volumes:
{{- with .Values.volumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
tolerations:
- key: "hosttype"
operator: "Equal"
value: "iluvatar"
effect: "NoSchedule"

View File

@@ -1,32 +0,0 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "sut.fullname" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "sut.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}

View File

@@ -1,61 +0,0 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "sut.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,18 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "sut.fullname" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: socket
selector:
{{- include "sut.selectorLabels" . | nindent 4 }}

View File

@@ -1,13 +0,0 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "sut.serviceAccountName" . }}
labels:
{{- include "sut.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}

View File

@@ -1,15 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "sut.fullname" . }}-test-connection"
labels:
{{- include "sut.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "sut.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never

View File

@@ -1,144 +0,0 @@
# Default values for sut.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: harbor.4pd.io/lab-platform/inf/python
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: 3.9
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Automatically mount a ServiceAccount's API credentials?
automount: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels: {}
podSecurityContext: {}
# fsGroup: 2000
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
service:
type: ClusterIP
port: 80
ingress:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
resources:
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
limits:
cpu: 1000m
memory: 4096Mi
requests:
cpu: 1000m
memory: 4096Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
# secret:
# secretName: mysecret
# optional: false
# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
# mountPath: "/etc/foo"
# readOnly: true
nodeSelector:
contest.4pd.io/accelerator: iluvatar-BI-V100
tolerations:
- key: hosttype
operator: Equal
value: iluvatar
effect: NoSchedule
affinity: {}
readinessProbe:
failureThreshold: 1000
httpGet:
path: /health
port: 80
scheme: HTTP
#readinessProbe:
# httpGet:
# path: /health
# port: 80
# scheme: HTTP
# initialDelaySeconds: 5 # 应用启动后等待 5 秒再开始探测
# failureThreshold: 5 # 连续失败 5 次后标记为未就绪
# successThreshold: 1 # 连续成功 1 次后标记为就绪
env:
- name: TZ
value: Asia/Shanghai
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: MY_NODE_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
#command: ''
priorityclassname: ''

View File

@@ -1,64 +0,0 @@
# Standalone debug script: fabricates the leaderboard submit environment
# for the whisper SUT, then imports the async runner to deploy it and
# prints the resulting SUT URL. Intended to be run manually, not imported.
import os
import tempfile
import shutil
# Drop any stale shared-SUT state left behind by a previous run.
if os.path.exists("/tmp/submit_private"):
    shutil.rmtree("/tmp/submit_private")
with tempfile.TemporaryDirectory() as tempdir:
    config_path = os.path.join(tempdir, "config.json")
    # Throwaway SSH keypair; the runner reads it via SSH_KEY_DIR below.
    # os.system returns 0 on success, hence `assert not`.
    assert not os.system(f"ssh-keygen -f {tempdir}/ssh-key-ecdsa -t ecdsa -b 521 -q -N \"\"")
    # Fake submit config (YAML despite the .json filename).
    # NOTE(review): indentation inside this literal was lost in extraction —
    # the nesting under `config.json:`/`leaderboard_options:` should be confirmed.
    config = """
model: whisper
model_key: whisper
config.json:
name: 'faster-whisper-server:latest'
support_devices:
- cpu
model_path: ''
port: 8080
other_ports: []
other_ports_count: 1
entrypoint: start.bat
MIN_CHUNK: 2.5
MIN_ADD_CHUNK: 2.5
COMPUTE_TYPE: int8
NUM_WORKERS: 1
CPU_THREADS: 2
BEAM_SIZE: 5
BATCH: 1
LANG: auto
DEVICE: cpu
CHUNK_LENGTH: 5
CLASS_MODEL: ./models/faster-whisper-base
EN_MODEL: ./models/faster-whisper-base
ZH_MODEL: ./models/faster-whisper-base
RU_MODEL: ./models/faster-whisper-base
PT_MODEL: ./models/faster-whisper-base
AR_MODEL: ./models/faster-whisper-base
NEW_VERSION: 1
NEED_RESET: 0
leaderboard_options:
nfs:
- name: whisper
srcRelativePath: leaderboard/pc_asr/en.tar.gz
mountPoint: /tmp
source: ceph_customer
"""
    with open(config_path, "w") as f:
        f.write(config)
    # Environment the runner module reads at import time.
    os.environ["SSH_KEY_DIR"] = tempdir
    os.environ["SUBMIT_CONFIG_FILEPATH"] = config_path
    os.environ["MODEL_MAPPING"] = '{"whisper": "edge-ml.tar.gz"}'
    # Import triggers the runner's module-level setup; then deploy the SUT.
    from run_async_a10 import get_sut_url_windows
    print(get_sut_url_windows())
    import time
    # Keep the process (and tempdir) alive for an hour of manual testing.
    time.sleep(3600)

View File

@@ -1,8 +0,0 @@
#!/bin/bash
export DATASET_FILEPATH=dataset/formatted1/de.zip
export RESULT_FILEPATH=out/result.json
export DETAILED_CASES_FILEPATH=out/detail_cases.json
export SUBMIT_CONFIG_FILEPATH=
export BENCHMARK_NAME=
export MY_POD_IP=127.0.0.1

View File

@@ -1,24 +0,0 @@
[tool.black]
line-length = 80
target-version = ['py39']
[tool.flake8]
max-line-length = 88
count=true
per-file-ignores="./annotation/manager.py:F401"
exclude=["./label", "__pycache__", "./migrations", "./logs", "./pids", "./resources"]
ignore=["W503", "E203"]
enable-extensions="G"
application-import-names=["flake8-isort", "flake8-logging-format", "flake8-builtins"]
import-order-style="edited"
extend-ignore = ["E203", "E701"]
[tool.isort]
py_version=39
profile="black"
multi_line_output=9
line_length=80
group_by_package=true
case_sensitive=true
skip_gitignore=true

114
run.py
View File

@@ -1,114 +0,0 @@
import gc
import json
import os
import sys
import time
import zipfile
import yaml
from schemas.context import ASRContext
from utils.client import Client
from utils.evaluator import BaseEvaluator
from utils.logger import logger
from utils.service import register_sut
# True when no submit config is provided, i.e. a local/dev run.
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
# Raw env value (string) or int 0; only its truthiness is used below.
UNIT_TEST = os.getenv("UNIT_TEST", 0)


def main():
    """Entry point: load the submit config, deploy or locate the system
    under test (SUT), then run the ASR evaluation loop.

    NOTE(review): the whole evaluation loop is currently disabled — it is
    kept as a dead triple-quoted string at the end of this function.
    """
    logger.info("执行……")
    # All inputs/outputs come from the judge environment; the defaults are
    # local test fixtures.
    dataset_filepath = os.getenv(
        "DATASET_FILEPATH",
        "./tests/resources/en.zip",
    )
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config")
    result_filepath = os.getenv("RESULT_FILEPATH", "./out/result")
    bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase")
    detail_cases_filepath = os.getenv("DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl")
    resource_name = os.getenv("BENCHMARK_NAME")
    # Submit config & launch the system under test.
    # NOTE(review): the guard tests DATASET_FILEPATH but the branch only
    # reads the submit config — possibly SUBMIT_CONFIG_FILEPATH was
    # intended; confirm with the caller environment before changing.
    if os.getenv("DATASET_FILEPATH", ""):
        from utils.helm import resource_check
        with open(submit_config_filepath, "r") as fp:
            st_config = yaml.safe_load(fp)
        st_config["values"] = resource_check(st_config.get("values", {}))
        if 'docker_images' in st_config:
            # Multi-image submits: use a fixed debug endpoint.
            sut_url = "ws://172.26.1.75:9827"
            os.environ['test'] = '1'
        elif 'docker_image' in st_config:
            # Normal path: deploy the submitted image as the SUT.
            sut_url = register_sut(st_config, resource_name)
        elif UNIT_TEST:
            sut_url = "ws://172.27.231.36:80"
        else:
            logger.error("config 配置错误,没有 docker_image")
            os._exit(1)
    else:
        # No dataset configured: fall back to a fixed test endpoint.
        os.environ['test'] = '1'
        sut_url = "ws://172.27.231.36:80"
    if UNIT_TEST:
        exit(0)
    # Disabled evaluation loop below (dead code preserved verbatim inside a
    # string literal; its original indentation was lost in extraction).
    """
    # 数据集处理
    local_dataset_path = "./dataset"
    os.makedirs(local_dataset_path, exist_ok=True)
    with zipfile.ZipFile(dataset_filepath) as zf:
    zf.extractall(local_dataset_path)
    config_path = os.path.join(local_dataset_path, "data.yaml")
    with open(config_path, "r") as fp:
    dataset_config = yaml.safe_load(fp)
    # 数据集信息
    dataset_global_config = dataset_config.get("global", {})
    dataset_query = dataset_config.get("query_data", {})
    evaluator = BaseEvaluator()
    # 开始预测
    for idx, query_item in enumerate(dataset_query):
    gc.collect()
    logger.info(f"开始执行 {idx} 条数据")
    context = ASRContext(**dataset_global_config)
    context.lang = query_item.get("lang", context.lang)
    context.file_path = os.path.join(local_dataset_path, query_item["file"])
    # context.audio_length = query_item["audio_length"]
    interactions = Client(sut_url, context).action()
    context.append_labels(query_item["voice"])
    context.append_preds(
    interactions["predict_data"],
    interactions["send_time"],
    interactions["recv_time"],
    )
    context.fail = interactions["fail"]
    if IN_TEST:
    with open('output.txt', 'w') as fp:
    original_stdout = sys.stdout
    sys.stdout = fp
    print(context)
    sys.stdout = original_stdout
    evaluator.evaluate(context)
    detail_case = evaluator.gen_detail_case()
    with open(detail_cases_filepath, "a") as fp:
    fp.write(json.dumps(detail_case.to_dict(), ensure_ascii=False) + "\n")
    time.sleep(4)
    evaluator.post_evaluate()
    output_result = evaluator.gen_result()
    # print(evaluator.__dict__)
    logger.info("执行完成. Result = {output_result}")
    with open(result_filepath, "w") as fp:
    json.dump(output_result, fp, indent=2, ensure_ascii=False)
    with open(bad_cases_filepath, "w") as fp:
    fp.write("当前榜单不存在 Bad Case\n")
    """


if __name__ == "__main__":
    main()

View File

@@ -1,757 +0,0 @@
import atexit
import concurrent.futures
import fcntl
import gc
import glob
import json
import os
import random
import signal
import sys
import tempfile
import threading
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor
import yaml
from fabric import Connection
from vmplatform import VMOS, Client, VMDataDisk
from schemas.context import ASRContext
from utils.client_async import ClientAsync
from utils.evaluator import BaseEvaluator
from utils.logger import logger
from utils.service import register_sut
# True when running outside the submit pipeline (no submit config provided).
IN_TEST = os.getenv("SUBMIT_CONFIG_FILEPATH", None) is None
# NOTE(review): getenv returns a *string*, so UNIT_TEST is truthy for "0" too
# — confirm callers only test presence, not value.
UNIT_TEST = os.getenv("UNIT_TEST", 0)
# Total number of dataset jobs in this submit (used for completion counting).
DATASET_NUM = os.getenv("DATASET_NUM")
# VM leaderboard parameters
SUT_TYPE = os.getenv("SUT_TYPE", "kubernetes")
SHARE_SUT = os.getenv("SHARE_SUT", "true") == "true"
VM_ID = 0
VM_IP = ""
do_deploy_chart = True
VM_CPU = int(os.getenv("VM_CPU", "2"))
VM_MEM = int(os.getenv("VM_MEM", "4096"))
MODEL_BASEPATH = os.getenv("MODEL_BASEPATH", "/tmp/customer/leaderboard/pc_asr")
MODEL_MAPPING = json.loads(os.getenv("MODEL_MAPPING", "{}"))
# SSH key pair used to reach the provisioned VM.
SSH_KEY_DIR = os.getenv("SSH_KEY_DIR", "/workspace")
SSH_PUBLIC_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa.pub")
SSH_KEY_FILE = os.path.join(SSH_KEY_DIR, "ssh-key-ecdsa")
CONNECT_KWARGS = {"key_filename": SSH_KEY_FILE}
# Shared-SUT parameters: multiple jobs coordinate through files in this dir.
JOB_ID = os.getenv("JOB_ID")
dirname = "/tmp/submit_private/sut_share"
os.makedirs(dirname, exist_ok=True)
SUT_SHARE_LOCK = os.path.join(dirname, "lock.lock")          # "I am the deployer" marker (created with mode "x")
SUT_SHARE_USE_LOCK = os.path.join(dirname, "use.lock")       # flock target serializing SUT usage
SUT_SHARE_STATUS = os.path.join(dirname, "status.json")      # deployer-published SUT status/credentials
SUT_SHARE_JOB_STATUS = os.path.join(dirname, f"job_status.{JOB_ID}")
SUT_SHARE_PUBLIC_FAIL = os.path.join(dirname, "one_job_failed")  # any-job-failed marker
# Held open for the process lifetime; flock'ed in get_sut_url_vm / released in main.
fd_lock = open(SUT_SHARE_USE_LOCK, "a")
def clean_vm_atexit():
    """atexit hook: delete the VM this job created (no-op for waiter jobs)."""
    global VM_ID, do_deploy_chart
    if not (VM_ID and do_deploy_chart):
        return
    logger.info("删除vm")
    failure = Client().delete_vm(VM_ID)
    if failure:
        logger.warning(f"删除vm失败: {failure}")
def put_file_to_vm(c: Connection, local_path: str, remote_path: str):
    """Upload *local_path* to *remote_path* on the VM via the fabric connection."""
    logger.info(f"uploading file {local_path} to {remote_path}")
    transfer = c.put(local_path, remote_path)
    logger.info("uploaded {0.local} to {0.remote}".format(transfer))
def deploy_windows_sut():
    """Provision a Windows 10 VM, install the submitted ASR model and launch
    the SUT server on it.

    Returns the websocket URL ("ws://<vm_ip>:<port>") of the server started
    inside the VM.  Side effects: sets module globals VM_ID / VM_IP and
    registers an atexit hook that deletes the VM.
    """
    global VM_ID
    global VM_IP
    # --- parse & validate the submit configuration ---
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "")
    with open(submit_config_filepath, "r") as fp:
        st_config = yaml.safe_load(fp)
    assert "model" in st_config, "未配置model"
    assert "model_key" in st_config, "未配置model_key"
    assert "config.json" in st_config, "未配置config.json"
    nfs = st_config.get("leaderboard_options", {}).get("nfs", [])
    assert len(nfs) > 0, "未配置nfs"
    assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内"
    model = st_config["model"]
    model_key = st_config["model_key"]
    model_path = ""
    config = st_config["config.json"]
    # Locate the submitted model archive on one of the mounted nfs sources.
    exist = False
    for nfs_item in nfs:
        if nfs_item["name"] == model_key:
            exist = True
            if nfs_item["source"] == "ceph_customer":
                model_path = os.path.join(
                    "/tmp/customer",
                    nfs_item["srcRelativePath"],
                )
            else:
                model_path = os.path.join(
                    "/tmp/juicefs",
                    nfs_item["srcRelativePath"],
                )
            break
    if not exist:
        raise RuntimeError(f"未找到nfs配置项 name={model_key}")
    # Write the SUT config with the model path rewritten to the VM layout.
    config_path = os.path.join(tempfile.mkdtemp(), "config.json")
    model_dir = os.path.basename(model_path).split(".")[0]
    config["model_path"] = f"E:\\model\\{model_dir}"
    with open(config_path, "w") as fp:
        json.dump(config, fp, ensure_ascii=False, indent=4)
    # --- create the VM and wait for it ---
    vmclient = Client()
    with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
        sshpublickey = fp.read().rstrip()
    VM_ID = vmclient.create_vm(
        "amd64",
        VMOS.windows10,
        VM_CPU,
        VM_MEM,
        "leaderboard-%s-submit-%s-job-%s"
        % (
            os.getenv("BENCHMARK_NAME"),
            os.getenv("SUBMIT_ID"),
            os.getenv("JOB_ID"),
        ),
        sshpublickey,
        datadisks=[
            VMDataDisk(
                size=50,
                disk_type="ssd",
                mount_path="/",
                filesystem="NTFS",
            )
        ],
    )
    # Make sure the VM is cleaned up on normal exit and on SIGTERM.
    atexit.register(clean_vm_atexit)
    signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum))
    VM_IP = vmclient.wait_until_vm_running(VM_ID)
    logger.info("vm created successfully, vm_ip: %s", VM_IP)

    def sut_startup():
        # Runs in a daemon thread: blocks on the server process via c.run.
        with Connection(
            VM_IP,
            "administrator",
            connect_kwargs=CONNECT_KWARGS,
        ) as c:
            # NOTE(review): the first assignment is dead — faster-whisper path
            # is immediately overwritten by the sensevoice path. Confirm which
            # server is intended.
            script_path = "E:\\base\\asr\\faster-whisper\\server"
            script_path = "E:\\install\\asr\\sensevoice\\server"
            bat_filepath = f"{script_path}\\start.bat"
            config_filepath = "E:\\submit\\config.json"
            # Empty command — presumably a connectivity probe; TODO confirm.
            result = c.run("")
            assert result.ok
            c.run(
                f'cd /d {script_path} & set "EDGE_ML_ENV_HOME=E:\\install" & {bat_filepath} {config_filepath}',
                warn=True,
            )
    # --- upload base package, model and config, then run setup ---
    with Connection(
        VM_IP,
        "administrator",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model])
        filename = os.path.basename(model_filepath)
        put_file_to_vm(c, model_filepath, "/E:/")
        result = c.run("mkdir E:\\base")
        assert result.ok
        result = c.run("mkdir E:\\model")
        assert result.ok
        result = c.run("mkdir E:\\submit")
        assert result.ok
        # NOTE(review): "(unknown)" looks like a redacted/garbled placeholder —
        # presumably {filename} (uploaded above) was intended here; verify.
        result = c.run(
            f"tar zxvf E:\\(unknown) -C E:\\base --strip-components 1"
        )
        assert result.ok
        result = c.run("E:\\base\\setup-win.bat E:\\install")
        assert result.ok
        put_file_to_vm(c, config_path, "/E:/submit")
        put_file_to_vm(c, model_path, "/E:/model")
        result = c.run(
            f"tar zxvf E:\\model\\{os.path.basename(model_path)} -C E:\\model"
        )
        assert result.ok
    # Launch the server asynchronously and give it a fixed grace period.
    threading.Thread(target=sut_startup, daemon=True).start()
    time.sleep(60)
    return f"ws://{VM_IP}:{config['port']}"
def deploy_macos_sut():
    """Provision a macOS 12 VM, install the submitted ASR model and launch
    the SUT server on it.  Mirrors deploy_windows_sut with macOS paths.

    Returns the websocket URL of the server; sets VM_ID / VM_IP globals and
    registers the VM-deletion atexit hook.
    """
    global VM_ID
    global VM_IP
    # --- parse & validate the submit configuration ---
    submit_config_filepath = os.getenv("SUBMIT_CONFIG_FILEPATH", "")
    with open(submit_config_filepath, "r") as fp:
        st_config = yaml.safe_load(fp)
    assert "model" in st_config, "未配置model"
    assert "model_key" in st_config, "未配置model_key"
    assert "config.json" in st_config, "未配置config.json"
    nfs = st_config.get("leaderboard_options", {}).get("nfs", [])
    assert len(nfs) > 0, "未配置nfs"
    assert st_config["model"] in MODEL_MAPPING, "提交模型不在可用模型范围内"
    model = st_config["model"]
    model_key = st_config["model_key"]
    model_path = ""
    config = st_config["config.json"]
    # Locate the submitted model archive on one of the mounted nfs sources.
    exist = False
    for nfs_item in nfs:
        if nfs_item["name"] == model_key:
            exist = True
            if nfs_item["source"] == "ceph_customer":
                model_path = os.path.join(
                    "/tmp/customer",
                    nfs_item["srcRelativePath"],
                )
            else:
                model_path = os.path.join(
                    "/tmp/juicefs",
                    nfs_item["srcRelativePath"],
                )
            break
    if not exist:
        raise RuntimeError(f"未找到nfs配置项 name={model_key}")
    config_path = os.path.join(tempfile.mkdtemp(), "config.json")
    model_dir = os.path.basename(model_path).split(".")[0]
    # --- create the VM and wait for it ---
    vmclient = Client()
    with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
        sshpublickey = fp.read().rstrip()
    VM_ID = vmclient.create_vm(
        "amd64",
        VMOS.macos12,
        VM_CPU,
        VM_MEM,
        "leaderboard-%s-submit-%s-job-%s"
        % (
            os.getenv("BENCHMARK_NAME"),
            os.getenv("SUBMIT_ID"),
            os.getenv("JOB_ID"),
        ),
        sshpublickey,
        datadisks=[
            VMDataDisk(
                size=50,
                disk_type="ssd",
                mount_path="/",
                filesystem="apfs",
            )
        ],
    )
    atexit.register(clean_vm_atexit)
    signal.signal(signal.SIGTERM, lambda signum, _: sys.exit(signum))
    VM_IP = vmclient.wait_until_vm_running(VM_ID)
    logger.info("vm created successfully, vm_ip: %s", VM_IP)
    # Discover the data volume mount point; config paths depend on it.
    with Connection(
        VM_IP,
        "admin",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        result = c.run("ls -d /Volumes/data*")
        assert result.ok
        volume_path = result.stdout.strip()
        config["model_path"] = f"{volume_path}/model/{model_dir}"
        with open(config_path, "w") as fp:
            json.dump(config, fp, ensure_ascii=False, indent=4)

    def sut_startup():
        # Runs in a daemon thread: blocks on the server process via c.run.
        with Connection(
            VM_IP,
            "admin",
            connect_kwargs=CONNECT_KWARGS,
        ) as c:
            script_path = f"{volume_path}/install/asr/sensevoice/server"
            startsh = f"{script_path}/start.sh"
            config_filepath = f"{volume_path}/submit/config.json"
            c.run(
                f"cd {script_path} && sh {startsh} {config_filepath}",
                warn=True,
            )
    # --- upload base package, model and config, then run setup ---
    with Connection(
        VM_IP,
        "admin",
        connect_kwargs=CONNECT_KWARGS,
    ) as c:
        model_filepath = os.path.join(MODEL_BASEPATH, MODEL_MAPPING[model])
        filename = os.path.basename(model_filepath)
        put_file_to_vm(c, model_filepath, f"{volume_path}")
        result = c.run(f"mkdir {volume_path}/base")
        assert result.ok
        result = c.run(f"mkdir {volume_path}/model")
        assert result.ok
        result = c.run(f"mkdir {volume_path}/submit")
        assert result.ok
        # NOTE(review): "(unknown)" looks like a redacted/garbled placeholder —
        # presumably {filename} was intended; verify.
        result = c.run(
            f"tar zxvf {volume_path}/(unknown) -C {volume_path}/base --strip-components 1"  # noqa: E501
        )
        assert result.ok
        result = c.run(
            f"sh {volume_path}/base/setup-mac.sh {volume_path}/install x64"
        )
        assert result.ok
        put_file_to_vm(c, config_path, f"{volume_path}/submit")
        put_file_to_vm(c, model_path, f"{volume_path}/model")
        result = c.run(
            f"tar zxvf {volume_path}/model/{os.path.basename(model_path)} -C {volume_path}/model"  # noqa: E501
        )
        assert result.ok
    # Launch the server asynchronously and give it a fixed grace period.
    threading.Thread(target=sut_startup, daemon=True).start()
    time.sleep(60)
    return f"ws://{VM_IP}:{config['port']}"
def get_sut_url_vm(vm_type: str):
    """Return the SUT websocket URL for a VM-based leaderboard run.

    In SHARE_SUT mode exactly one job (the one that wins the race to create
    SUT_SHARE_LOCK) deploys the VM and publishes its status/credentials via
    SUT_SHARE_STATUS; all other jobs wait for that status and then compete
    for the use-lock (fd_lock) to serialize their calls against the shared
    SUT.  Any job failure is broadcast through SUT_SHARE_PUBLIC_FAIL.
    """
    global VM_ID
    global VM_IP
    global do_deploy_chart
    do_deploy_chart = True
    # Launch the SUT

    def check_job_failed():
        # Watchdog thread: abort this job if any sibling job already failed.
        while True:
            time.sleep(30)
            if os.path.exists(SUT_SHARE_PUBLIC_FAIL):
                logger.error("there is a job failed in current submit")
                sys.exit(1)
    sut_url = ""
    threading.Thread(target=check_job_failed, daemon=True).start()
    if SHARE_SUT:
        # Random jitter so concurrent jobs don't all race at once.
        time.sleep(10 * random.random())
        try:
            # mode "x" fails if the file exists: first creator is the deployer.
            open(SUT_SHARE_LOCK, "x").close()
        except Exception:
            do_deploy_chart = False
        start_at = time.time()

        def file_last_updated_at(file: str):
            return os.stat(file).st_mtime if os.path.exists(file) else start_at
        if not do_deploy_chart:
            # Waiter: poll the deployer-published status (24h staleness cap).
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("waiting")
            while (
                time.time() - file_last_updated_at(SUT_SHARE_STATUS)
                <= 60 * 60 * 24
            ):
                logger.info(
                    "Waiting sut application to be deployed by another job"
                )
                time.sleep(10 + random.random())
                if os.path.exists(SUT_SHARE_STATUS):
                    # The status file may be mid-write; retry the JSON parse.
                    get_status = False
                    for _ in range(10):
                        try:
                            with open(SUT_SHARE_STATUS, "r") as f:
                                status = json.load(f)
                            get_status = True
                            break
                        except Exception:
                            time.sleep(1 + random.random())
                            continue
                    if not get_status:
                        raise RuntimeError(
                            "Failed to get status of sut application"
                        )
                    assert (
                        status.get("status") != "failed"
                    ), "Failed to deploy sut application, \
please check other job logs"
                    if status.get("status") == "running":
                        # Adopt the deployer's VM identity and SSH key pair.
                        VM_ID = status.get("vmid")
                        VM_IP = status.get("vmip")
                        sut_url = status.get("sut_url")
                        with open(SSH_PUBLIC_KEY_FILE, "w") as fp:
                            fp.write(status.get("pubkey"))
                        with open(SSH_KEY_FILE, "w") as fp:
                            fp.write(status.get("prikey"))
                        logger.info("Successfully get deployed sut application")
                        break
    if do_deploy_chart:
        # Deployer path (also the only path when SHARE_SUT is disabled).
        try:
            # Hold the use-lock while deploying; released in main() on success.
            fcntl.flock(fd_lock, fcntl.LOCK_EX)
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("waiting")
            pending = True

            def update_status():
                # Keep refreshing "pending" so waiters' staleness check passes.
                while pending:
                    time.sleep(30)
                    if not pending:
                        break
                    with open(SUT_SHARE_STATUS, "w") as f:
                        json.dump({"status": "pending"}, f)
            threading.Thread(target=update_status, daemon=True).start()
            if vm_type == "windows":
                sut_url = deploy_windows_sut()
            else:
                sut_url = deploy_macos_sut()
        except Exception:
            # Broadcast the failure to all sibling jobs before re-raising.
            open(SUT_SHARE_PUBLIC_FAIL, "w").close()
            with open(SUT_SHARE_STATUS, "w") as f:
                json.dump({"status": "failed"}, f)
            raise
        finally:
            pending = False
        # Publish the running SUT plus the SSH key pair for waiter jobs.
        with open(SUT_SHARE_STATUS, "w") as f:
            pubkey = ""
            with open(SSH_PUBLIC_KEY_FILE, "r") as fp:
                pubkey = fp.read().rstrip()
            prikey = ""
            with open(SSH_KEY_FILE, "r") as fp:
                prikey = fp.read()
            json.dump(
                {
                    "status": "running",
                    "vmid": VM_ID,
                    "vmip": VM_IP,
                    "pubkey": pubkey,
                    "sut_url": sut_url,
                    "prikey": prikey,
                },
                f,
            )
    else:
        # Waiter: spin on a non-blocking flock until the SUT is free for us.
        while True:
            time.sleep(5 + random.random())
            try:
                fcntl.flock(fd_lock, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except Exception:
                logger.info("尝试抢占调用sut失败继续等待 5s ...")
        with open(SUT_SHARE_JOB_STATUS, "w") as f:
            f.write("running")
    return sut_url
def get_sut_url():
    """Resolve the websocket URL of the system under test.

    Dispatches to the VM flow for windows/macos SUT types; otherwise builds
    the kubernetes helm values (cpu/memory, optional vGPU) from the submit
    config and registers the SUT, falling back to hard-coded debug URLs in
    local/unit-test modes.
    """
    if SUT_TYPE in ("windows", "macos"):
        return get_sut_url_vm(SUT_TYPE)
    submit_config_filepath = os.getenv(
        "SUBMIT_CONFIG_FILEPATH", "./tests/resources/submit_config"
    )
    CPU = os.getenv("SUT_CPU", "2")
    MEMORY = os.getenv("SUT_MEMORY", "4Gi")
    resource_name = os.getenv("BENCHMARK_NAME")
    # Task info — language families covered by the benchmark:
    # Slavic: Russian, Polish
    # Germanic: English, German, Dutch
    # Romance: Spanish, Portuguese, French, Italian
    # Semitic: Arabic, Hebrew
    # Submit config & launch the system under test
    if os.getenv("DATASET_FILEPATH", ""):
        with open(submit_config_filepath, "r") as fp:
            st_config = yaml.safe_load(fp)
        if "values" not in st_config:
            st_config["values"] = {}
        # Overwrite the submitter's resources with the enforced quota.
        st_config["values"]["resources"] = {}
        st_config["values"]["resources"]["limits"] = {}
        st_config["values"]["resources"]["limits"]["cpu"] = CPU
        st_config["values"]["resources"]["limits"]["memory"] = MEMORY
        st_config["values"]["resources"]["requests"] = {}
        st_config["values"]["resources"]["requests"]["cpu"] = CPU
        st_config["values"]["resources"]["requests"]["memory"] = MEMORY
        if os.getenv("RESOURCE_TYPE", "cpu") == "cpu":
            # NOTE(review): resources were just overwritten above, so this
            # GPU check can never trigger — confirm whether it should inspect
            # the submitter's original values instead.
            values = st_config["values"]
            limits = values.get("resources", {}).get("limits", {})
            requests = values.get("resources", {}).get("requests", {})
            if (
                "nvidia.com/gpu" in limits
                or "nvidia.com/gpumem" in limits
                or "nvidia.com/gpucores" in limits
                or "nvidia.com/gpu" in requests
                or "nvidia.com/gpumem" in requests
                or "nvidia.com/gpucores" in requests
            ):
                raise Exception("禁止使用GPU!")
        else:
            # vGPU run: allocate SUT_VGPU virtual GPUs on the A10 pool.
            vgpu_num = int(os.getenv("SUT_VGPU", "3"))
            st_config["values"]["resources"]["limits"]["nvidia.com/gpu"] = (
                str(vgpu_num)
            )
            st_config["values"]["resources"]["limits"][
                "nvidia.com/gpumem"
            ] = str(1843 * vgpu_num)
            st_config["values"]["resources"]["limits"][
                "nvidia.com/gpucores"
            ] = str(8 * vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpu"
            ] = str(vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpumem"
            ] = str(1843 * vgpu_num)
            st_config["values"]["resources"]["requests"][
                "nvidia.com/gpucores"
            ] = str(8 * vgpu_num)
            st_config["values"]["nodeSelector"] = {}
            st_config["values"]["nodeSelector"][
                "contest.4pd.io/accelerator"
            ] = "A10vgpu"
            st_config["values"]["tolerations"] = []
            toleration_item = {}
            toleration_item["key"] = "hosttype"
            toleration_item["operator"] = "Equal"
            toleration_item["value"] = "vgpu"
            toleration_item["effect"] = "NoSchedule"
            st_config["values"]["tolerations"].append(toleration_item)
        if "docker_images" in st_config:
            # Debug hook: hard-coded local SUT address.
            sut_url = "ws://172.26.1.75:9827"
            os.environ["test"] = "1"
        elif "docker_image" in st_config:
            sut_url = register_sut(st_config, resource_name)
        elif UNIT_TEST:
            sut_url = "ws://172.27.231.36:80"
        else:
            logger.error("config 配置错误,没有 docker_image")
            os._exit(1)
        return sut_url
    else:
        # Local test path (no dataset configured): fixed debug SUT.
        os.environ["test"] = "1"
        sut_url = "ws://172.27.231.36:80"
        sut_url = "ws://172.26.1.75:9827"
        return sut_url
def load_merge_dataset(dataset_filepath: str) -> dict:
    """Extract a merged multi-language dataset zip and build one combined
    query list.

    The outer zip contains per-language zips named "asr.<lang>…"; each is
    extracted, its query_data sorted by audio file size (descending) and
    truncated to roughly 30 minutes per language, then all queries are
    merged, tagged with their language and deterministically shuffled.
    """
    local_dataset_path = "./dataset"
    os.makedirs(local_dataset_path, exist_ok=True)
    with zipfile.ZipFile(dataset_filepath) as zf:
        zf.extractall(local_dataset_path)
    config = {}
    sub_datasets = os.listdir(local_dataset_path)
    for sub_dataset in sub_datasets:
        if sub_dataset.startswith("asr."):
            # NOTE(review): [4:] keeps everything after "asr." — if the file
            # is "asr.en.zip" this yields "en.zip", not "en"; confirm naming.
            lang = sub_dataset[4:]
            lang_path = os.path.join(local_dataset_path, lang)
            os.makedirs(lang_path, exist_ok=True)
            with zipfile.ZipFile(
                os.path.join(local_dataset_path, sub_dataset)
            ) as zf:
                zf.extractall(lang_path)
            lang_config_path = os.path.join(lang_path, "data.yaml")
            with open(lang_config_path, "r") as fp:
                lang_config = yaml.safe_load(fp)
            # Record each audio's size (bytes) and absolutize its path.
            audio_lengths = {}
            for query_item in lang_config.get("query_data", []):
                audio_path = os.path.join(
                    lang_path,
                    query_item["file"],
                )
                query_item["file"] = audio_path
                audio_lengths[query_item["file"]] = os.path.getsize(
                    audio_path,
                )
            # Longest (largest) audio first.
            lang_config["query_data"] = sorted(
                lang_config.get("query_data", []),
                key=lambda x: audio_lengths[x["file"]],
                reverse=True,
            )
            idx = 0
            length = 0.0
            for query_item in lang_config["query_data"]:
                audio_length = audio_lengths[query_item["file"]]
                # bytes / 32000 ≈ seconds for 16kHz 16-bit mono — TODO confirm
                length += audio_length / 32000
                idx += 1
                # Cap each language at half an hour of audio.
                if length >= 30 * 60:
                    break
            lang_config["query_data"] = lang_config["query_data"][:idx]
            config[lang] = lang_config
    # Merge all languages into one shuffled query list (seeded for
    # reproducibility).
    config["query_data"] = []
    for lang, lang_config in config.items():
        if lang == "query_data":
            continue
        for query_item in lang_config["query_data"]:
            config["query_data"].append(
                {
                    **query_item,
                    "lang": lang,
                }
            )
    random.Random(0).shuffle(config["query_data"])
    return config
def postprocess_failed():
    """Create the shared failure marker so sibling jobs abort too."""
    with open(SUT_SHARE_PUBLIC_FAIL, "w"):
        pass
def main():
    """Run the ASR benchmark: load the dataset, fan out queries to the SUT
    with a thread pool, evaluate results and write result/bad-case files.

    In SHARE_SUT mode the deployer job additionally waits until all
    DATASET_NUM jobs report success before exiting.
    """
    dataset_filepath = os.getenv(
        "DATASET_FILEPATH",
        "/Users/4paradigm/Projects/dataset/asr/de.zip",
        # "./tests/resources/en.zip",
    )
    result_filepath = os.getenv("RESULT_FILEPATH", "./out/result")
    bad_cases_filepath = os.getenv("BAD_CASES_FILEPATH", "./out/badcase")
    detail_cases_filepath = os.getenv(
        "DETAILED_CASES_FILEPATH", "./out/detailcase.jsonl"
    )
    thread_num = int(os.getenv("THREAD_NUM", "1"))
    # Dataset preparation
    config = {}
    # NOTE(review): any non-empty value (including "0") enables merge mode —
    # confirm this is intended.
    if os.getenv("MERGE_DATASET", "1"):
        config = load_merge_dataset(dataset_filepath)
        dataset_query = config["query_data"]
    else:
        # Single-language dataset zip.
        local_dataset_path = "./dataset"
        os.makedirs(local_dataset_path, exist_ok=True)
        with zipfile.ZipFile(dataset_filepath) as zf:
            zf.extractall(local_dataset_path)
        config_path = os.path.join(local_dataset_path, "data.yaml")
        with open(config_path, "r") as fp:
            dataset_config = yaml.safe_load(fp)
        # Sort query_data by audio file size, largest first.
        lang = os.getenv("lang")
        if lang is None:
            lang = dataset_config.get("global", {}).get("lang", "en")
        audio_lengths = []
        for query_item in dataset_config.get("query_data", []):
            query_item["lang"] = lang
            audio_path = os.path.join(local_dataset_path, query_item["file"])
            query_item["file"] = audio_path
            audio_lengths.append(os.path.getsize(audio_path) / 1024 / 1024)
        # NOTE(review): .index(x) inside the key is O(n^2) and assumes unique
        # items — works, but fragile.
        dataset_config["query_data"] = sorted(
            dataset_config.get("query_data", []),
            key=lambda x: audio_lengths[dataset_config["query_data"].index(x)],
            reverse=True,
        )
        # Dataset info
        # dataset_global_config = dataset_config.get("global", {})
        dataset_query = dataset_config.get("query_data", {})
        config[lang] = dataset_config
    # SUT url
    sut_url = get_sut_url()
    try:
        # Run the benchmark
        logger.info("开始执行")
        evaluator = BaseEvaluator()
        future_list = []
        with ThreadPoolExecutor(max_workers=thread_num) as executor:
            for idx, query_item in enumerate(dataset_query):
                context = ASRContext(
                    **config[query_item["lang"]].get("global", {}),
                )
                context.lang = query_item["lang"]
                context.file_path = query_item["file"]
                context.append_labels(query_item["voice"])
                future = executor.submit(
                    ClientAsync(sut_url, context, idx).action
                )
                future_list.append(future)
            # Evaluate each query as soon as its SUT interaction completes.
            for future in concurrent.futures.as_completed(future_list):
                context = future.result()
                evaluator.evaluate(context)
                detail_case = evaluator.gen_detail_case()
                with open(detail_cases_filepath, "a") as fp:
                    fp.write(
                        json.dumps(
                            detail_case.to_dict(),
                            ensure_ascii=False,
                        )
                        + "\n",
                    )
                del context
                gc.collect()
        evaluator.post_evaluate()
        output_result = evaluator.gen_result()
        logger.info("执行完成")
        with open(result_filepath, "w") as fp:
            json.dump(output_result, fp, indent=2, ensure_ascii=False)
        with open(bad_cases_filepath, "w") as fp:
            fp.write("当前榜单不存在 Bad Case\n")
        if SHARE_SUT:
            # Report success and release the shared-SUT use lock.
            with open(SUT_SHARE_JOB_STATUS, "w") as f:
                f.write("success")
            fcntl.flock(fd_lock, fcntl.LOCK_UN)
            fd_lock.close()
        # Deployer keeps the VM alive until every sibling job succeeded.
        while SHARE_SUT and do_deploy_chart:
            time.sleep(30)
            success_num = 0
            for job_status_file in glob.glob(dirname + "/job_status.*"):
                with open(job_status_file, "r") as f:
                    job_status = f.read()
                success_num += job_status == "success"
            if success_num == int(DATASET_NUM):
                break
            logger.info("Waiting for all jobs to complete")
    except Exception:
        if SHARE_SUT:
            postprocess_failed()
        raise
    sys.exit(0)
# Script entry point.
if __name__ == "__main__":
    main()

View File

View File

@@ -1,90 +0,0 @@
import os
from copy import deepcopy
from typing import Dict, List, Optional
from pydantic import BaseModel, Field
from schemas.stream import StreamDataModel
class LabelContext(BaseModel):
    """A single ground-truth speech segment: time span plus reference text."""

    start: float  # segment start (seconds — TODO confirm unit against dataset)
    end: float  # segment end
    answer: str  # reference transcript for the segment
class PredContext(BaseModel):
    """One SUT prediction plus the wall-clock send/receive timestamps."""

    recognition_results: StreamDataModel
    recv_time: Optional[float] = Field(None)  # time the result was received
    send_time: Optional[float] = Field(None)  # time the audio chunk was sent
class ASRContext:
    """Per-query state for one ASR benchmark interaction.

    Holds the audio/stream parameters (from the dataset's global config via
    **kwargs, with env-var overrides), the ground-truth labels and the
    collected predictions with their timing information.
    """

    def __init__(self, **kwargs):
        self.bits = kwargs.get("bits", 16)
        self.channel = kwargs.get("channel", 1)
        self.sample_rate = kwargs.get("sample_rate", 16000)
        self.audio_format = kwargs.get("format", "wav")
        self.enable_words = kwargs.get("enable_words", True)
        self.char_contains_rate = kwargs.get("char_contains_rate", 0.8)
        # env var "lang" overrides the dataset-provided language
        self.lang = os.getenv("lang")
        if self.lang is None:
            self.lang = kwargs.get("lang", "en")
        self.stream = kwargs.get("stream", True)
        # pause between chunk sends, overridable via env
        self.wait_time = float(os.getenv("wait_time", 0.1))
        # bytes of audio per chunk = rate * bytes-per-sample * wait_time
        # NOTE(review): does not account for self.channel — confirm mono-only.
        self.chunk_size = self.sample_rate * self.bits / 8 * self.wait_time
        if int(os.getenv('chunk_size_set', 0)):
            self.chunk_size = int(os.getenv('chunk_size_set', 0))
        self.audio_length = 0
        self.file_path = ""
        self.labels: List[LabelContext] = kwargs.get("labels", [])
        self.preds: List[PredContext] = kwargs.get("preds", [])
        self.label_sentences: List[str] = []
        self.pred_sentences: List[str] = []
        self.send_time_start_end = []
        self.recv_time_start_end = []
        self.fail = False
        self.fail_char_contains_rate_num = 0
        self.punctuation_num = 0
        self.pred_punctuation_num = 0

    def append_labels(self, voices: List[Dict]):
        """Append ground-truth segments (dicts with start/end/answer)."""
        for voice_data in voices:
            label_context = LabelContext(**voice_data)
            self.labels.append(label_context)

    def append_preds(
        self,
        predict_data: List[StreamDataModel],
        send_time: List[float],
        recv_time: List[float],
    ):
        """Store predictions with per-item timestamps; also keep only the
        first/last send and receive times for latency summaries."""
        self.send_time_start_end = [send_time[0], send_time[-1]] if len(send_time) > 0 else []
        self.recv_time_start_end = [recv_time[0], recv_time[-1]] if len(recv_time) > 0 else []
        for pred_item, send_time_item, recv_time_item in zip(predict_data, send_time, recv_time):
            # deepcopy so later mutation of the source model can't leak in
            pred_item = deepcopy(pred_item)
            pred_context = PredContext(recognition_results=pred_item.model_dump())
            pred_context.send_time = send_time_item
            pred_context.recv_time = recv_time_item
            self.preds.append(pred_context)

    def to_dict(self):
        """Serialize the context (labels/preds as JSON strings) for dumping."""
        return {
            "bits": self.bits,
            "channel": self.channel,
            "sample_rate": self.sample_rate,
            "audio_format": self.audio_format,
            "enable_words": self.enable_words,
            "stream": self.stream,
            "wait_time": self.wait_time,
            "chunk_size": self.chunk_size,
            "labels": [item.model_dump_json() for item in self.labels],
            "preds": [item.model_dump_json() for item in self.preds],
        }

View File

@@ -1,18 +0,0 @@
from typing import List
from pydantic import BaseModel, Field
class QueryDataSentence(BaseModel):
    """One labelled sentence within a dataset query."""

    answer: str = Field(description="文本label")
    start: float = Field(description="句子开始时间")
    end: float = Field(description="句子结束时间")
class QueryData(BaseModel):
    """A dataset query: one audio file with its labelled sentences."""

    lang: str = Field(description="语言")
    file: str = Field(description="音频文件位置")
    duration: float = Field(description="音频长度")
    voice: List[QueryDataSentence] = Field(
        description="音频文件的文本label内容"
    )

View File

@@ -1,66 +0,0 @@
from typing import List
from pydantic import BaseModel, ValidationError, field_validator
from pydantic import model_validator
class StreamWordsModel(BaseModel):
    """Word-level timing entry inside a streaming ASR result."""

    text: str
    start_time: float
    end_time: float

    @model_validator(mode="after")
    def check_result(self):
        """Reject inverted time spans.

        Raise ValueError here: pydantic wraps it into a proper
        ValidationError.  Constructing ValidationError directly with a bare
        message is unsupported in pydantic v2 and would itself raise
        TypeError instead of the intended validation failure.
        """
        if self.end_time < self.start_time:
            raise ValueError("end-time 小于 start-time, error")
        return self
class StreamDataModel(BaseModel):
    """One streaming recognition result (a sentence/paragraph with words)."""

    text: str
    language: str
    final_result: bool  # True when this is the final result for para_seq
    para_seq: int
    start_time: float
    end_time: float
    words: List[StreamWordsModel]

    @model_validator(mode="after")
    def check_result(self):
        """Reject inverted time spans.

        Raise ValueError here: pydantic wraps it into a proper
        ValidationError.  Constructing ValidationError directly with a bare
        message is unsupported in pydantic v2 and would itself raise
        TypeError instead of the intended validation failure.
        """
        if self.end_time < self.start_time:
            raise ValueError("end-time 小于 start-time, error")
        return self
class StreamResultModel(BaseModel):
    """Envelope for a streaming result; converts times from ms to seconds."""

    asr_results: StreamDataModel

    @field_validator('asr_results', mode="after")
    def convert_to_seconds(cls, v: StreamDataModel, values):
        # Incoming times are in milliseconds; normalize to seconds.
        v.end_time = v.end_time / 1000
        v.start_time = v.start_time / 1000
        for word in v.words:
            word.start_time /= 1000
            word.end_time /= 1000
        return v

    class Config:
        # NOTE(review): validate_assignment re-runs this validator on every
        # assignment to asr_results — re-assigning would divide by 1000 again.
        validate_assignment = True
class NonStreamDataModel(BaseModel):
    """One non-streaming recognition result segment."""

    text: str
    para_seq: int
    start_time: float
    end_time: float

    @model_validator(mode="after")
    def check_result(self):
        """Reject inverted time spans.

        Raise ValueError here: pydantic wraps it into a proper
        ValidationError.  Constructing ValidationError directly with a bare
        message is unsupported in pydantic v2 and would itself raise
        TypeError instead of the intended validation failure.
        """
        if self.end_time < self.start_time:
            raise ValueError("end-time 小于 start-time, error")
        return self
class NonStreamResultModel(BaseModel):
    """Envelope for a full (non-streaming) recognition response."""

    contents: List[NonStreamDataModel]

View File

@@ -1,53 +0,0 @@
import os
import sys
from collections import defaultdict
import yaml
def main(dataset_dir):
    """Sanity-check every sub-dataset's data.yaml under *dataset_dir*.

    For each query it verifies that voice segments are sorted by start time,
    that start <= end for every segment (err1), and that consecutive
    segments do not overlap (err2).  Problems are printed and counted per
    sub-directory.
    """
    # 'sub_dir' instead of 'dir' — avoid shadowing the builtin.
    sub_dirs = [
        d
        for d in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, d))
    ]
    problem_dirs = set()
    problem_count = defaultdict(int)
    for sub_dir in sub_dirs:
        with open(os.path.join(dataset_dir, sub_dir, "data.yaml"), "r") as f:
            # safe_load suffices for plain data files and avoids the
            # arbitrary object construction full_load allows.
            data = yaml.safe_load(f)
        for query_i, query in enumerate(data["query_data"]):
            voices = sorted(query["voice"], key=lambda x: x["start"])
            if voices != query["voice"]:
                print("-----", sub_dir)
            # Single loop covers index 0 (err1 only) and the rest (err1+err2);
            # also no longer crashes on an empty voice list.
            for voice_i, voice in enumerate(voices):
                if voice["start"] > voice["end"]:
                    print(
                        "err1: %s%s个query的第%d个voice的start大于end: %s"
                        % (sub_dir, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(sub_dir)
                if voice_i > 0 and voice["start"] < voices[voice_i - 1]["end"]:
                    print(
                        "err2: %s%s个query的第%d个voice的start小于前一个voice的end: %s"
                        % (sub_dir, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(sub_dir)
                    problem_count[sub_dir] += 1
    print(len(sub_dirs))
    print(problem_dirs)
    print(problem_count)
# CLI entry: argv[1] = dataset directory to validate.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("指定 测试数据集文件夹")
        sys.exit(1)
    main(sys.argv[1])

View File

@@ -1,108 +0,0 @@
import json
import os
import shutil
import sys
import zipfile
import yaml
"""
target
{
"global": {
"lang": ""
},
"query_data": [
"file": "",
"duration": 2.0,
"voice": [
{
"answer": "",
"start": 0.0,
"end": 1.0
}
]
]
}
"""
def situation_a(meta, dataset_folder, output_folder):
    """Convert a "combined" meta.json dataset into per-language data.yaml
    datasets and zip each language folder.

    Expected input shape:
    {
        "combined": {
            "en": [
                {
                    "wav": "*.wav",
                    "transcriptions": [
                        {"text": "", "start": 0.0, "end": 1.0}
                    ],
                    "duration": 2.0
                }
            ]
        }
    }
    """
    meta = meta["combined"]
    for lang, arr in meta.items():
        print("processing", lang)
        # two-letter ISO language codes only
        assert len(lang) == 2
        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)
        data = {"global": {"lang": lang}, "query_data": []}
        query_data = data["query_data"]
        for item in arr:
            os.makedirs(
                os.path.join(lang_folder, os.path.dirname(item["wav"])),
                exist_ok=True,
            )
            # NOTE(review): swaps the ".wav" extension for ".mp3" — assumes
            # the source folder already holds the .mp3 siblings; confirm.
            mp3_file = item["wav"][:-4] + ".mp3"
            shutil.copyfile(
                os.path.join(dataset_folder, mp3_file),
                os.path.join(lang_folder, mp3_file),
            )
            query_data_item = {
                "file": mp3_file,
                "duration": float(item["duration"]),
                "voice": [],
            }
            query_data.append(query_data_item)
            voice = query_data_item["voice"]
            for v in item["transcriptions"]:
                voice.append(
                    {
                        "answer": v["text"],
                        "start": float(v["start"]),
                        "end": float(v["end"]),
                    }
                )
        with open(os.path.join(lang_folder, "data.yaml"), "w") as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")
        # Zip the language folder with paths relative to the folder root.
        with zipfile.ZipFile(
            os.path.join(output_folder, lang + ".zip"), "w"
        ) as ziper:
            dirname = lang_folder
            for path, _, files in os.walk(dirname):
                for file in files:
                    ziper.write(
                        os.path.join(path, file),
                        os.path.join(path[len(dirname) :], file),
                        zipfile.ZIP_DEFLATED,
                    )
# CLI entry: argv[1] = dataset folder (contains meta.json), argv[2] = output folder.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]
    with open(os.path.join(dataset_folder, "meta.json")) as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)

View File

@@ -1,56 +0,0 @@
import json
import sys
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops
def main(detailcase_file: str):
    """Re-evaluate the first detail case in *detailcase_file* and print the
    resulting edit-ops metrics (only final results are scored)."""
    with open(detailcase_file) as f:
        case = json.load(f)[0]
    final_preds = [
        pred
        for pred in (StreamDataModel(**p) for p in case["preds"])
        if pred.final_result
    ]
    label = QueryData(**case["label"])
    print(evaluate_editops(label, final_preds))
def evaluate_from_record(detailcase_file: str, record_path: str):
    """Re-run evaluate_editops from a saved tokenization record.

    The record supplies pre-tokenized predictions/labels plus the raw
    recognition results; only entries flagged final_result are kept, with
    tokens_pred filtered in lockstep.
    """
    with open(detailcase_file) as f:
        d = json.load(f)[0]
    label = d["label"]
    label = QueryData(**label)
    with open(record_path) as f:
        record = json.load(f)
    tokens_pred = record["tokens_pred"]
    tokens_label = record["tokens_label"]
    recognition_results = record["recognition_results"]
    recognition_results = list(
        map(lambda x: StreamDataModel(**x), recognition_results)
    )
    # Keep only final results, filtering token lists at the same indices.
    a, b = [], []
    for i, rr in enumerate(recognition_results):
        if rr.final_result:
            a.append(tokens_pred[i])
            b.append(rr)
    tokens_pred = a
    recognition_results = b
    print(
        evaluate_editops(
            label,
            recognition_results,
            tokens_pred,
            tokens_label,
        )
    )
# CLI entry: argv[1] = detailcase file (argv[2] = record path for the
# alternate evaluate_from_record flow).
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("请指定 detailcase 文件路径")
        sys.exit(1)
    main(sys.argv[1])
    # evaluate_from_record(sys.argv[1], sys.argv[2])

Binary file not shown.

View File

@@ -1,11 +0,0 @@
# Evaluator image: python3.8 base with pinned requirements; runs main.py.
FROM harbor.4pd.io/inf/base-python3.8-ubuntu:1.1.0
WORKDIR /workspace
# Copy requirements first so dependency layers cache across code changes.
ADD ./requirements.txt /workspace
RUN pip install -r ./requirements.txt -i https://nexus.4pd.io/repository/pypi-all/simple --trusted-host nexus.4pd.io --extra-index-url https://mirrors.aliyun.com/pypi/simple/ \
    && pip cache purge
ADD . /workspace
CMD ["python", "main.py"]

View File

@@ -1,313 +0,0 @@
import logging
import os
import threading
import time
from typing import Optional
import flask
import requests
from werkzeug.datastructures import FileStorage
# Flask app exposing the mock SUT /predict endpoint.
app = flask.Flask(__name__)
# Set once by the first heartbeat() call so only one loop runs.
heartbeat_active = False
# Standalone logger: no propagation, single stream handler with timestamps.
log = logging.getLogger(__name__)
log.propagate = False
level = logging.INFO
log.setLevel(level)
formatter = logging.Formatter(
    "[%(asctime)s] %(levelname)s : %(pathname)s:%(lineno)d - %(message)s",
    "%Y-%m-%d %H:%M:%S",
)
streamHandler = logging.StreamHandler()
streamHandler.setLevel(level)
streamHandler.setFormatter(formatter)
log.addHandler(streamHandler)
def heartbeat(url):
    """Post a RUNNING heartbeat to *url* every 10 seconds, forever.

    The module-level flag ensures only the first caller keeps a loop alive.
    NOTE(review): the check-then-set on heartbeat_active is not thread-safe —
    two near-simultaneous callers could both pass the guard; confirm whether
    duplicate heartbeats matter to the harness.
    """
    global heartbeat_active
    if heartbeat_active:
        return
    heartbeat_active = True
    while True:
        try:
            requests.post(url, json={"status": "RUNNING"})
        except Exception:
            # best-effort: missed heartbeats are tolerated
            pass
        time.sleep(10)
def asr(
    audio_file: FileStorage,
    language: Optional[str],
    progressCallbackUrl: str,
    taskId: str,
):
    """TODO: read audio_file, run speech recognition, and stream results to
    progressCallbackUrl in real time.

    This reference stub posts a single hard-coded incremental result and
    then a FINISHED status.
    """
    # ignore BEGIN
    # Local leaderboard testing only.
    if os.getenv("LOCAL_TEST"):
        return local_test(progressCallbackUrl, taskId)
    # ignore END
    language = "de"
    # One incremental recognition callback.
    requests.post(
        progressCallbackUrl,
        json={
            "taskId": taskId,
            "status": "RUNNING",
            "recognition_results": {  # incremental result; omit this field when status is FINISHED or ERROR
                "text": "最先启动的还是",
                "final_result": True,
                "para_seq": 0,
                "language": language,
                "start_time": 6300,
                "end_time": 6421,
                "words": [
                    {
                        "text": "",
                        "start_time": 6300,
                        "end_time": 6321,
                    },
                    {
                        "text": "",
                        "start_time": 6321,
                        "end_time": 6345,
                    },
                    {
                        "text": "",
                        "start_time": 6345,
                        "end_time": 6350,
                    },
                    {
                        "text": "",
                        "start_time": 6350,
                        "end_time": 6370,
                    },
                    {
                        "text": "",
                        "start_time": 6370,
                        "end_time": 6386,
                    },
                    {
                        "text": "",
                        "start_time": 6386,
                        "end_time": 6421,
                    },
                    {
                        "text": "",
                        "start_time": 6421,
                        "end_time": 6435,
                    },
                ],
            },
        },
    )
    # ... all recognition results sent.
    # Signal end of recognition.
    requests.post(
        progressCallbackUrl,
        json={
            "taskId": taskId,
            "status": "FINISHED",
        },
    )
@app.post("/predict")
def predict():
    """Accept an audio prediction request and return immediately.

    Spawns one daemon thread posting heartbeats and another running the
    (asynchronous) recognition; results flow back via progressCallbackUrl.
    """
    body = flask.request.form
    language = body.get("language")
    if language is None:
        # No-op placeholder string: language detection is left to the SUT.
        "自行判断语种"
    taskId = body["taskId"]
    progressCallbackUrl = body["progressCallbackUrl"]
    heartbeatUrl = body["heartbeatUrl"]
    threading.Thread(
        target=heartbeat, args=(heartbeatUrl,), daemon=True
    ).start()
    audio_file = flask.request.files["file"]
    # audio_file.stream  # read the file stream
    # audio_file.save("audio.mp3")  # save the file
    threading.Thread(
        target=asr,
        args=(audio_file, language, progressCallbackUrl, taskId),
        daemon=True,
    ).start()
    return flask.jsonify({"status": "OK"})
# ignore BEGIN
def local_test(progressCallbackUrl: str, taskId: str):
    """Ignore this method; it is only used for local leaderboard debugging.

    Simulates a streaming ASR worker: loads labelled voice segments from a
    YAML file, randomly merges/perturbs them, and POSTs recognition results
    to ``progressCallbackUrl`` paragraph by paragraph, finishing with a
    FINISHED status callback.
    """
    import random
    import re
    import yaml
    def callback(content):
        # POST one progress update; None signals completion.
        # Best-effort: network failures are deliberately swallowed.
        try:
            if content is None:
                requests.post(
                    progressCallbackUrl,
                    json={"taskId": taskId, "status": "FINISHED"},
                )
            else:
                requests.post(
                    progressCallbackUrl,
                    json={
                        "taskId": taskId,
                        "status": "RUNNING",
                        "recognition_results": content,
                    },
                )
        except Exception:
            pass
    with open(
        os.getenv("LOCAL_TEST_DATA_PATH", "../dataset/out/data.yaml")
    ) as f:
        data = yaml.full_load(f)
    voices = data["query_data"][0]["voice"]
    # Random delay before the first send; send_interval is effectively 0.
    first_send_time = random.randint(3, 5)
    send_interval = random.random() * 0
    log.info("首次发送%ss 发送间隔%ss" % (first_send_time, send_interval))
    time.sleep(first_send_time)
    # With 30% probability, merge consecutive sentences into single ones
    # (at most ~3 sentences per merged segment).
    if random.random() < 0.3:
        log.info("将部分句子合并成单句 每次合并的句子不超过3句")
        rand_idx = 0
        rand_sep = [0, len(voices) - 1]
        # Insert random split points between the first and last index.
        while rand_sep[rand_idx] + 1 <= rand_sep[rand_idx + 1] - 1:
            rand_cursep = random.randint(
                rand_sep[rand_idx] + 1,
                min(rand_sep[rand_idx + 1] - 1, rand_sep[rand_idx] + 1 + 3),
            )
            rand_sep.insert(rand_idx + 1, rand_cursep)
            rand_idx += 1
        merged_voices = []
        for i, cur_sep in enumerate(rand_sep[:-1]):
            voice = voices[cur_sep]
            # Concatenate answers and extend the end time over the merged span.
            for j in range(cur_sep + 1, rand_sep[i + 1]):
                voice["answer"] += voices[j]["answer"]
                voice["end"] = voices[j]["end"]
            merged_voices.append(voice)
        merged_voices.append(voices[rand_sep[-1]])
        voices = merged_voices
    def split_and_keep(text, delimiters):
        # Build a regex matching either runs of non-delimiter text or a
        # single delimiter, so the delimiters are kept in the output.
        pattern = "|".join(re.escape(delimiter) for delimiter in delimiters)
        pattern = f"(?:[^{pattern}]+|[{pattern}])"
        return re.findall(pattern, text)
    puncs = [",", ".", "?", "!", ";", ":"]
    para_seq = 0
    for voice in voices:
        answer: str = voice["answer"]
        start_time: float = voice["start"]
        end_time: float = voice["end"]
        words = split_and_keep(answer, puncs)
        temp_words = []
        for i, word in enumerate(words):
            # Randomly drop interior words (15%) to simulate imperfect ASR.
            if i > 0 and i < len(words) - 1 and random.random() < 0.15:
                log.info("随机删除word")
                continue
            temp_words.extend(word.split(" "))
        if len(temp_words) == 0:
            temp_words = words[0].split(" ")
        words = temp_words
        answer = " ".join(words)
        words = list(map(lambda x: x.strip(), words))
        words = list(filter(lambda x: len(x) > 0, words))
        # Distribute the segment's time span evenly over the words
        # (timestamps are emitted in milliseconds).
        words_withtime = []
        word_unittime = (end_time - start_time) / len(words)
        for i, word in enumerate(words):
            word_start = start_time + word_unittime * i
            word_end = word_start + word_unittime
            words_withtime.append(
                {
                    "text": word,
                    "start_time": word_start * 1000,
                    "end_time": word_end * 1000,
                }
            )
        # Snap punctuation at the sentence edges onto the neighbouring word's
        # timestamp, making the punctuation itself instantaneous.
        punc_at = 0
        while punc_at < len(words) and words[punc_at] in puncs:
            punc_at += 1
        if punc_at < len(words):
            words_withtime[punc_at]["start_time"] = words_withtime[0][
                "start_time"
            ]
        for i in range(0, punc_at):
            words_withtime[i]["start_time"] = words_withtime[0]["start_time"]
            words_withtime[i]["end_time"] = words_withtime[0]["start_time"]
        punc_at = len(words) - 1
        while punc_at >= 0 and words[punc_at] in puncs:
            punc_at -= 1
        if punc_at >= 0:
            words_withtime[punc_at]["end_time"] = words_withtime[-1]["end_time"]
        for i in range(punc_at + 1, len(words)):
            words_withtime[i]["start_time"] = (
                words_withtime[-1]["end_time"] + 0.1
            )
            words_withtime[i]["end_time"] = words_withtime[-1]["end_time"] + 0.1
        # With 40% probability, first emit a partial (non-final) result.
        if random.random() < 0.4 and len(words_withtime) > 1:
            log.info("发送一次final_result=False")
            rand_idx = random.randint(1, len(words_withtime) - 1)
            recognition_result = {
                "text": " ".join(
                    map(lambda x: x["text"], words_withtime[:rand_idx])
                ),
                "final_result": False,
                "para_seq": para_seq,
                # NOTE(review): language is hard-coded to "de" in this stub.
                "language": "de",
                "start_time": start_time * 1000,
                "end_time": end_time * 1000,
                "words": words_withtime[:rand_idx],
            }
            callback(recognition_result)
        recognition_result = {
            "text": answer,
            "final_result": True,
            "para_seq": para_seq,
            "language": "de",
            "start_time": start_time * 1000,
            "end_time": end_time * 1000,
            "words": words_withtime,
        }
        callback(recognition_result)
        para_seq += 1
        log.info("send %s" % para_seq)
        time.sleep(send_interval)
    # Signal completion.
    callback(None)
# ignore END
if __name__ == "__main__":
    # Serve the Flask app on all interfaces; the container maps port 80.
    app.run(host="0.0.0.0", port=80)

View File

@@ -1,3 +0,0 @@
flask
requests
pyyaml

View File

@@ -1,16 +0,0 @@
import json
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops
# Local smoke test: re-score the first recorded detail case from a prior run.
with open("out/detail_cases.json") as fp:
    first_case = json.load(fp)[0]

# Re-hydrate the recorded predictions and label into their pydantic models.
predictions = [StreamDataModel.model_validate(p) for p in first_case["preds"]]
ground_truth = QueryData.model_validate(first_case["label"])
print(evaluate_editops(ground_truth, predictions))

View File

@@ -1,93 +0,0 @@
"""
f(a, b) 计算 a -> b 的编辑距离使用的方法是之前asr榜单的方法
g(a, b) 计算 a -> b 的编辑距离,使用的是原始的编辑距离计算方法
test() 是对拍程序
"""
import random
import string
from copy import deepcopy
from typing import List, Tuple
import Levenshtein
def mapping(gt: str, dt: str):
    """Split ground-truth and decoded strings into per-character token lists.

    Returns:
        A pair ``(gt_tokens, dt_tokens)`` of single-character lists.
    """
    # list(s) is the idiomatic (and faster) equivalent of [i for i in s].
    return list(gt), list(dt)
def token_mapping(
    tokens_gt: List[str], tokens_dt: List[str]
) -> Tuple[List[str], List[str]]:
    """Align two token sequences by padding edit positions with None.

    Edit operations are applied from last to first so that earlier
    positions remain valid while padding is inserted.
    """
    aligned_gt = deepcopy(tokens_gt)
    aligned_dt = deepcopy(tokens_dt)
    edit_ops = Levenshtein.editops(aligned_gt, aligned_dt)
    for tag, src_pos, dst_pos in reversed(edit_ops):
        if tag == "insert":
            aligned_gt.insert(src_pos, None)
        elif tag == "delete":
            aligned_dt.insert(dst_pos, None)
    return aligned_gt, aligned_dt
def cer(tokens_gt_mapping: List[str], tokens_dt_mapping: List[str]):
    """Count edit operations from two alignment-padded token sequences.

    Both inputs are the output of ``token_mapping``: equal-length lists in
    which ``None`` marks a position introduced by the alignment.

    Returns:
        ``(replace, delete, insert)`` operation counts for gt -> dt.
        (The original docstring claimed "1-cer, token-cnt", which did not
        match the actual return value.)
    """
    # A None on the ground-truth side means the decoded side inserted a token.
    insert = sum(1 for item in tokens_gt_mapping if item is None)
    # A None on the decoded side means a ground-truth token was deleted.
    delete = sum(1 for item in tokens_dt_mapping if item is None)
    equal = sum(
        1
        for token_gt, token_dt in zip(tokens_gt_mapping, tokens_dt_mapping)
        if token_gt == token_dt
    )
    # NOTE(review): delete is deliberately NOT subtracted here (see trailing
    # comment), so deleted positions are counted as replacements too — this
    # reproduces the previous leaderboard's behavior; confirm against g().
    replace = len(tokens_gt_mapping) - insert - equal  # - delete
    return replace, delete, insert
def f(a, b):
    """Edit-operation counts for a -> b via the alignment-based method."""
    gt_tokens, dt_tokens = mapping(a, b)
    aligned_pair = token_mapping(gt_tokens, dt_tokens)
    return cer(*aligned_pair)
def raw(tokens_gt, tokens_dt):
    """Count replace/delete/insert operations straight from Levenshtein editops."""
    edit_ops = Levenshtein.editops(deepcopy(tokens_gt), deepcopy(tokens_dt))
    counts = {"replace": 0, "delete": 0, "insert": 0}
    for op in edit_ops:
        counts[op[0]] += 1
    return counts["replace"], counts["delete"], counts["insert"]
def g(a, b):
    """Edit-operation counts for a -> b using the plain edit-distance method."""
    tokens_a, tokens_b = mapping(a, b)
    return raw(tokens_a, tokens_b)
def check(a, b):
    """Compare both counting methods on one pair; print the counts on mismatch."""
    counts_f = f(a, b)
    counts_g = g(a, b)
    if counts_f != counts_g:
        print(counts_f, counts_g)
    return counts_f == counts_g
def random_string(length):
    """Return a random string of lowercase ASCII letters of the given length."""
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))
def test():
    """Fuzz-compare f and g on random 30-char pairs; stop at the first mismatch."""
    for _ in range(10000):
        left = random_string(30)
        right = random_string(30)
        if not check(left, right):
            print(left, right)
            break
test()  # run the differential test when this script is executed