First Commit

This commit is contained in:
2025-08-05 19:09:06 +08:00
commit d236601521
7 changed files with 498 additions and 0 deletions

14
Dockerfile Normal file
View File

@@ -0,0 +1,14 @@
# Iluvatar Corex (BI100) GPU inference base image: Ubuntu 20.04, Python 3.10.
FROM zibo.harbor.iluvatar.com.cn:30000/saas/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.2
# Corex-specific torch/torchaudio builds — the +corex suffix must match the
# base image's Corex version (3.2.1). pyannote.audio + av for inference/decoding.
RUN pip install --no-cache-dir torch==2.1.0+corex.3.2.1 torchaudio==2.1.0+corex.3.2.1 pyannote.audio av
# Model weights baked into the image at /model — the path speaker_identification.py reads.
COPY ./pyannote_models/pyannote-wespeaker-voxceleb-resnet34-LM /model
# Application sources (flat layout: all modules side by side in /workspace).
COPY ./src/filesystem_storage.py /workspace/filesystem_storage.py
COPY ./src/speaker_identification.py /workspace/speaker_identification.py
COPY ./src/iflytek_interface_server.py /workspace/iflytek_interface_server.py
COPY ./launch_service /workspace/launch_service
WORKDIR /workspace/
ENTRYPOINT ["./launch_service"]

9
launch_service Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Container entrypoint: print startup diagnostics, then run the HTTP service.
date
# Last 50 lines of cpuinfo — enough to show the final core's details.
# (Was `cat /proc/cpuinfo | tail -n 50`; tail can read the file directly.)
tail -n 50 /proc/cpuinfo
# GPU status (Iluvatar's nvidia-smi equivalent).
ixsmi
# NOTE(review): `export` dumps the entire environment into container logs —
# this can leak credentials/tokens; consider removing or filtering in production.
export
date
python3 iflytek_interface_server.py

View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

Binary file not shown.

56
src/filesystem_storage.py Normal file
View File

@@ -0,0 +1,56 @@
import os
import shutil
import logging

logger = logging.getLogger(__name__)

# Default root directory holding one subdirectory per group.
BASE_DIRECTORY = './embedding_data'


class FileSystemStorage:
    """Filesystem-backed key/value store: one directory per group, one file
    per item.

    ``auth_token`` is accepted by every method for interface compatibility
    but is not used by this backend.
    """

    def __init__(self, base_directory=BASE_DIRECTORY):
        # Root directory under which all groups/items live.
        self.base_dir = base_directory

    def create_group(self, auth_token, group_id):
        """Create the directory for ``group_id``.

        Raises FileExistsError if the group already exists, or OSError if
        the directory cannot be created.
        """
        path = os.path.join(self.base_dir, group_id)
        if os.path.exists(path):
            raise FileExistsError(f"{group_id}")
        try:
            os.makedirs(path)
        except OSError as e:
            logger.error(f"Error creating directory {path}: {e}")
            raise

    def get(self, auth_token, group_id, item_id):
        """Return the raw bytes stored for ``item_id`` inside ``group_id``."""
        # path computed outside try so the except handler can always format it
        path = os.path.join(self.base_dir, group_id, item_id)
        try:
            with open(path, 'rb') as f:
                return f.read()
        except Exception as e:
            logger.error(f"Error reading from {path}: {e}")
            raise

    def save(self, auth_token, group_id, item_id, content):
        """Write ``content`` bytes for ``item_id``, overwriting any existing item."""
        path = os.path.join(self.base_dir, group_id, item_id)
        try:
            with open(path, 'wb') as f:
                f.write(content)
        except Exception as e:
            logger.error(f"Error saving to {path}: {e}")
            raise

    def remove(self, auth_token, group_id, item_id):
        """Delete a single item file.

        Bug fix: the first parameter was misspelled ``slef``, so every
        ``self`` reference in the body raised NameError at call time.
        """
        path = os.path.join(self.base_dir, group_id, item_id)
        try:
            os.remove(path)
        except Exception as e:
            logger.error(f"Error remove item {path}: {e}")
            raise

    def remove_group(self, auth_token, group_id):
        """Delete a whole group directory and everything inside it."""
        path = os.path.join(self.base_dir, group_id)
        try:
            shutil.rmtree(path)
        except Exception as e:
            logger.error(f"Error remove group {path}: {e}")
            raise

View File

@@ -0,0 +1,212 @@
import av
import io
import time
import json
import uuid
import torch
import numpy
import base64
import logging
from flask import Flask, request, jsonify
# Root-logger configuration: third-party libraries stay at WARNING; this
# module's own logger is raised to DEBUG below.
format_str = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
datefmt= '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.WARNING, format=format_str, datefmt=datefmt)
# NOTE(review): import deliberately placed after basicConfig — presumably so
# import-time log records from speaker_identification use this configuration.
from speaker_identification import init_embedding_model, create_group, enroll_speaker, identify_speaker, calc_similarity, list_speakers, remove_speaker, remove_group
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Load the embedding model once at import time, before serving any request.
init_embedding_model()
app = Flask(__name__)
def samples_from_raw_bytes(data):
    """Interpret `data` as raw 16-bit signed PCM and scale to float32 in [-1, 1)."""
    pcm = numpy.frombuffer(data, dtype=numpy.int16)
    return pcm.astype(numpy.float32) / 32768.0
def samples_from_lame_bytes(data):
    """Decode a compressed audio byte stream (e.g. MP3) with PyAV and return
    float32 mono samples resampled to 16 kHz.

    Returns an empty float32 array when the stream yields no audio frames.
    """
    chunks = []
    with av.open(io.BytesIO(data), mode='r') as container:
        audio_stream = next(s for s in container.streams if s.type == 'audio')
        resampler = av.AudioResampler(format='fltp', layout='mono', rate=16000)
        for packet in container.demux(audio_stream):
            for frame in packet.decode():
                for out_frame in resampler.resample(frame):
                    chunks.append(out_frame.to_ndarray().flatten())
        # Flush any samples still buffered inside the resampler.
        for out_frame in resampler.resample(None):
            chunks.append(out_frame.to_ndarray().flatten())
        if not chunks:
            return numpy.array([], dtype=numpy.float32)
        return numpy.concatenate(chunks)
def waveform_input_from_b64_audio(audio_b64, audio_format):
    """Decode base64 audio into the {'waveform', 'sample_rate'} dict that the
    embedding model expects.

    'lame' selects compressed decoding; anything else is treated as raw
    16-bit PCM. The sample rate is reported as 16000 in both cases.
    """
    raw = base64.b64decode(audio_b64)
    decode = samples_from_lame_bytes if audio_format == 'lame' else samples_from_raw_bytes
    samples = decode(raw)
    # Shape (1, num_samples): a single-channel batch for the model.
    waveform = torch.from_numpy(samples[numpy.newaxis, :])
    return {'waveform': waveform, 'sample_rate': 16000}
def conv_group_id(aid, gid):
    """Namespace a caller-supplied group id by app id so different apps
    cannot collide on the same groupId."""
    return '{}_____{}'.format(aid, gid)
def process_create_group(req_json):
    """Handle 'createGroup': create a new speaker group and echo its ids."""
    params = req_json['parameter']['s782b4996']
    gid = params['groupId']
    create_group(conv_group_id(req_json['header']['app_id'], gid))
    return {
        'groupName': f'{gid} (groupId)',
        'groupId': gid,
        'groupInfo': f'{gid} (groupId)',
    }
def process_create_feature(req_json):
    """Handle 'createFeature': enroll one speaker's audio in a group."""
    params = req_json['parameter']['s782b4996']
    group_id = conv_group_id(req_json['header']['app_id'], params['groupId'])
    feature_id = params['featureId']
    resource = req_json['payload']['resource']
    audio = waveform_input_from_b64_audio(resource['audio'], resource['encoding'])
    enroll_speaker(group_id, feature_id, audio)
    return {'featureId': feature_id}
def process_feature_list(req_json):
    """Handle 'queryFeatureList': list the feature (speaker) ids enrolled
    in the requested group.

    Bug fix: the original built 'featureInfo' from `speaker_id`, a name not
    defined in this function (NameError at request time); it now uses the
    comprehension variable `sid`.
    """
    params = req_json['parameter']['s782b4996']
    aid = req_json['header']['app_id']
    gid = params['groupId']
    group_id = conv_group_id(aid, gid)
    speaker_ids = list_speakers(group_id)
    resp = [{'featureInfo': f'{sid} (featureId)',
             'featureId': sid}
            for sid in speaker_ids]
    return resp
def process_score_feature(req_json):
    """Handle 'searchScoreFea': 1:1 verification — score incoming audio
    against one enrolled feature."""
    params = req_json['parameter']['s782b4996']
    group_id = conv_group_id(req_json['header']['app_id'], params['groupId'])
    target_id = params['dstFeatureId']
    resource = req_json['payload']['resource']
    audio = waveform_input_from_b64_audio(resource['audio'], resource['encoding'])
    score = calc_similarity(audio, group_id, target_id)
    return {
        'score': score,
        'featureInfo': f'{target_id} (featureId)',
        'featureId': target_id,
    }
def process_search_feature(req_json):
    """Handle 'searchFea': 1:N identification — return the topK best-matching
    enrolled speakers for the incoming audio."""
    params = req_json['parameter']['s782b4996']
    group_id = conv_group_id(req_json['header']['app_id'], params['groupId'])
    resource = req_json['payload']['resource']
    audio = waveform_input_from_b64_audio(resource['audio'], resource['encoding'])
    matches = identify_speaker(audio, group_id, params['topK'])
    score_list = [
        {'score': score,
         'featureInfo': f'{sid} (featureId)',
         'featureId': sid}
        for score, sid in matches
    ]
    return {'scoreList': score_list}
def process_delete_feature(req_json):
    """Handle 'deleteFeature': remove a single enrolled speaker from a group."""
    params = req_json['parameter']['s782b4996']
    group_id = conv_group_id(req_json['header']['app_id'], params['groupId'])
    remove_speaker(group_id, params['featureId'])
    return {"msg": "success"}
def process_delete_group(req_json):
    """Handle 'deleteGroup': delete an entire speaker group.

    Bug fix: the original called remove_group(group_id, speaker_id) —
    `speaker_id` is undefined in this scope (NameError) and remove_group
    takes only the group id.
    """
    params = req_json['parameter']['s782b4996']
    aid = req_json['header']['app_id']
    gid = params['groupId']
    group_id = conv_group_id(aid, gid)
    remove_group(group_id)
    resp = {"msg": "success"}
    return resp
def generate_interface_response(success, resp, req_id):
    """Wrap a handler result in the iFlytek-style response envelope.

    The business payload is JSON-serialized, base64-encoded, and placed in
    payload.updateFeatureRes.text; header.code is 0 on success, 10009 on error.

    NOTE(review): header.message stays "success" even when code=10009 —
    confirm the upstream protocol expects that.
    """
    encoded = base64.b64encode(json.dumps(resp).encode('utf-8')).decode('utf-8')
    header = {
        "code": 0 if success else 10009,
        "message": "success",
        "sid": req_id,
    }
    payload = {"updateFeatureRes": {"status": "3", "text": encoded}}
    return {"header": header, "payload": payload}
@app.route('/v1/private/s782b4996', methods=['POST'])
def s782b4996():
    """Single iFlytek-style entry point: dispatch on the request's 'func' field.

    Any exception is caught and converted into an error envelope (code 10009)
    so the endpoint always returns a well-formed response.
    """
    req_id = str(uuid.uuid4())
    # Dispatch table instead of an if/elif chain; unknown funcs yield resp=None.
    handlers = {
        'createGroup': process_create_group,
        'createFeature': process_create_feature,
        'queryFeatureList': process_feature_list,
        'searchScoreFea': process_score_feature,
        'searchFea': process_search_feature,
        'deleteFeature': process_delete_feature,
        'deleteGroup': process_delete_group,
    }
    try:
        req_json = request.json
        func = req_json['parameter']['s782b4996']['func']
        logger.info(f'Processing request {func=}, {req_id=}...')
        started = time.time()
        handler = handlers.get(func)
        resp = handler(req_json) if handler is not None else None
        elapsed = time.time() - started
        logger.debug(f'{elapsed=:.3f}s Result = {resp}')
        logger.info(f'Request {req_id} completed.')
        ret = generate_interface_response(True, resp, req_id)
    except Exception as e:
        logger.warning(f'Exception {e}', exc_info=True)
        ret = generate_interface_response(False, {'error_msg': str(e)}, req_id)
    return ret
@app.route('/health')
@app.route('/health_check')
def health():
    """Liveness probe endpoint, reachable at /health and /health_check."""
    status = {'status': 'ok'}
    return status
# Run the Flask development server when executed directly; port 80 matches
# the container's expected service port (see launch_service).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=80)

View File

@@ -0,0 +1,172 @@
import time
import torch
import pickle
import logging
import hashlib
import numpy as np
from pyannote.audio import Model, Inference
from filesystem_storage import FileSystemStorage
logger = logging.getLogger(__name__)
# Auth token forwarded to the storage backend (FileSystemStorage ignores it).
STOR_AUTH_TOKEN = ''
# Weights path inside the container — see the Dockerfile's COPY to /model.
EMB_MODEL_PATH = "/model/pytorch_model.bin"
# Module-global model handle; populated by init_embedding_model().
embedding_model = None
storage = FileSystemStorage()
class SpeakerIDException(Exception):
    """Domain error raised by the public speaker-identification API."""

    def get_err_msg(self):
        """Return the human-readable message this exception was raised with."""
        return str(self)
def _safe_id(id):
original_id = str(id)
allowed_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
cleaned_chars = (c if c in allowed_chars else '_' for c in original_id)
cleaned_id = "".join(cleaned_chars)
truncated_id = cleaned_id[:300]
hasher = hashlib.sha256()
hasher.update(original_id.encode('utf-8'))
full_hash = hasher.hexdigest()
short_hash = full_hash[:16]
final_id = f"{truncated_id}_{short_hash}"
return final_id
def _calc_embedding(audio):
    """Run the module-global pyannote embedding model on one audio input.

    `audio` is the {'waveform', 'sample_rate'} dict built by the server;
    requires init_embedding_model() to have been called first.
    """
    global embedding_model
    try:
        return embedding_model(audio)
    except Exception as e:
        logger.error(f'Could not calc embedding: inference audio error {e}')
        raise
def _cosine_similarity(u, v):
u = np.asarray(u)
v = np.asarray(v)
if np.linalg.norm(u) == 0 or np.linalg.norm(v) == 0:
logger.warning("Warning: One or both vectors are zero vectors. Cosine distance is undefined.")
return 1.0
dot_product = np.dot(u, v)
norm_u = np.linalg.norm(u)
norm_v = np.linalg.norm(v)
similarity = float(dot_product / (norm_u * norm_v))
similarity = max(0.0, min(similarity, 1.0))
return similarity
def _load_group(group_id):
    """Fetch and unpickle a group's embedding map.

    Returns a dict of {safe_speaker_id: (original_speaker_id, embedding)}.
    NOTE(review): pickle.loads on stored blobs — safe only while the storage
    backend is fully trusted.
    """
    try:
        blob = storage.get(STOR_AUTH_TOKEN, group_id, 'embeddings')
        group_map = pickle.loads(blob)
    except Exception as e:
        logger.error(f'Could not load group with id {group_id}, err = {e}')
        raise
    return group_map
def _save_group(group_id, embeddings):
    """Pickle and persist a group's embedding map, replacing any prior blob."""
    global storage
    try:
        blob = pickle.dumps(embeddings)
        storage.save(STOR_AUTH_TOKEN, group_id, 'embeddings', blob)
    except Exception as e:
        logger.error(f'Could not save group with id {group_id}, err = {e}')
        raise
def _create_group(group_id):
    """Create a new, empty group in storage under its sanitized id.

    Bug fix: the original referenced `group_id_s` without ever defining it
    (NameError on every call); the sanitized id is now derived from
    `group_id` via _safe_id, matching the public create_group().
    """
    try:
        group_id_s = _safe_id(group_id)
        storage.create_group(STOR_AUTH_TOKEN, group_id_s)
        _save_group(group_id_s, {})
    except Exception as e:
        logger.error(f'Could not create group with id {group_id}, err = {e}')
        raise
# Public Functions
def init_embedding_model():
    """Load the pyannote embedding model and move it to the GPU.

    Populates the module-global `embedding_model`; must be called once
    before any embedding computation.
    """
    global embedding_model
    base_model = Model.from_pretrained(EMB_MODEL_PATH)
    # "whole" window: one embedding vector per full input clip.
    embedding_model = Inference(base_model, window="whole")
    embedding_model.to(torch.device('cuda'))
def create_group(group_id):
    """Create an empty speaker group; raises SpeakerIDException on any failure."""
    global storage
    try:
        safe_gid = _safe_id(group_id)
        storage.create_group(STOR_AUTH_TOKEN, safe_gid)
        # Seed the group with an empty embedding map.
        _save_group(safe_gid, {})
    except Exception as e:
        logger.error(f'Could not create group with id {group_id}, err = {e}')
        raise SpeakerIDException('Create Group failed')
def enroll_speaker(group_id, speaker_id, audio):
    """Compute an embedding for `audio` and store it under `speaker_id`
    in the given group, overwriting any previous enrollment for that id.

    The map stores (original_speaker_id, embedding) keyed by sanitized id,
    so listings can return the caller's original id.
    Raises SpeakerIDException on any failure.
    """
    try:
        group_id_s = _safe_id(group_id)
        speaker_id_s = _safe_id(speaker_id)
        embeddings = _load_group(group_id_s)
        speaker_emb = _calc_embedding(audio)
        embeddings[speaker_id_s] = (speaker_id, speaker_emb)
        _save_group(group_id_s, embeddings)
    except Exception as e:
        # Fixed log message: was "Could enroll speaker ..." (missing "not").
        logger.error(f'Could not enroll speaker with {group_id=} {speaker_id=}, err = {e}')
        raise SpeakerIDException('Enroll Speaker failed')
def list_speakers(group_id):
    """Return the original (unsanitized) speaker ids enrolled in a group.

    Raises SpeakerIDException on any failure (e.g. unknown group).
    """
    try:
        safe_gid = _safe_id(group_id)
        group_map = _load_group(safe_gid)
        # Values are (original_speaker_id, embedding) pairs.
        return [orig_id for orig_id, _emb in group_map.values()]
    except Exception as e:
        logger.error(f'Could not list speakers. {group_id=}, err = {e}')
        raise SpeakerIDException('List Speakers failed')
def remove_speaker(group_id, speaker_id):
    """Delete one speaker's enrollment from a group.

    Raises SpeakerIDException on any failure, including when the speaker
    is not enrolled (the underlying KeyError is wrapped).
    """
    try:
        safe_gid = _safe_id(group_id)
        safe_sid = _safe_id(speaker_id)
        group_map = _load_group(safe_gid)
        del group_map[safe_sid]
        _save_group(safe_gid, group_map)
    except Exception as e:
        logger.error(f'Could not remove speaker. {group_id=} {speaker_id = }, err = {e}')
        raise SpeakerIDException('Remove Speaker failed')
def remove_group(group_id):
    """Delete an entire group and all of its enrollments from storage.

    Raises SpeakerIDException on any failure.
    """
    try:
        storage.remove_group(STOR_AUTH_TOKEN, _safe_id(group_id))
    except Exception as e:
        logger.error(f'Could not remove group {group_id}, err = {e}')
        raise SpeakerIDException('Remove Group failed')
def calc_similarity(audio, group_id, speaker_id):
    """1:1 verification: cosine similarity between `audio` and one enrolled
    speaker's embedding, in [0.0, 1.0].

    Raises SpeakerIDException on any failure (unknown group/speaker,
    inference error).
    """
    try:
        safe_gid = _safe_id(group_id)
        safe_sid = _safe_id(speaker_id)
        # Stored value is (original_speaker_id, embedding).
        _orig_id, enrolled_emb = _load_group(safe_gid)[safe_sid]
        probe_emb = _calc_embedding(audio)
        return _cosine_similarity(enrolled_emb, probe_emb)
    except Exception as e:
        logger.error(f'Could not calculate similarity. {group_id=} {speaker_id=}: {e}')
        raise SpeakerIDException('Calculate Similarity failed')
def identify_speaker(audio, group_id, top_k):
    """1:N identification: return up to `top_k` (similarity, original_speaker_id)
    pairs for `audio` against every enrollment in the group, best first.

    Raises SpeakerIDException on any failure.
    """
    try:
        safe_gid = _safe_id(group_id)
        group_map = _load_group(safe_gid)
        probe_emb = _calc_embedding(audio)
        scored = [(_cosine_similarity(probe_emb, emb), orig_id)
                  for orig_id, emb in group_map.values()]
        # Tuples sort by similarity first (descending), then id.
        scored.sort(reverse=True)
        return scored[:top_k]
    except Exception as e:
        logger.error(f'Could not identify speaker. {group_id=}: {e}')
        raise SpeakerIDException('Identify Speaker failed')