142 lines
3.6 KiB
Python
Executable File
142 lines
3.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
|
|
# Please run this file on your rk3588 board
|
|
|
|
try:
    from rknnlite.api import RKNNLite
except Exception:
    # ``except Exception`` instead of a bare ``except:`` so Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not intercepted and mislabelled
    # as an installation problem. Any ordinary failure of the import still
    # prints the hint below and is then re-raised unchanged.
    print("Please run this file on your board (linux + aarch64 + npu)")
    print("You need to install rknn_toolkit_lite2")
    print(
        " from https://github.com/airockchip/rknn-toolkit2/tree/master/rknn-toolkit-lite2/packages"
    )
    print(
        "https://github.com/airockchip/rknn-toolkit2/blob/v2.1.0/rknn-toolkit-lite2/packages/rknn_toolkit_lite2-2.1.0-cp310-cp310-linux_aarch64.whl"
    )
    print("is known to work")
    raise
|
|
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Tuple
|
|
|
|
import numpy as np
|
|
import soundfile as sf
|
|
|
|
|
|
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return the samples of its first channel.

    Args:
      filename: Path of the audio file; handed to ``soundfile.read``.

    Returns:
      A tuple ``(samples, sample_rate)`` where ``samples`` is a contiguous
      1-D float32 array taken from the first channel and ``sample_rate`` is
      the rate reported by ``soundfile.read``.
    """
    audio, sample_rate = sf.read(filename, always_2d=True, dtype="float32")

    # always_2d=True guarantees a second axis, so column 0 always exists,
    # even for mono files. Only the first channel is kept.
    first_channel = audio[:, 0]

    # The column slice is a strided view; make it contiguous for consumers.
    return np.ascontiguousarray(first_channel), sample_rate
|
|
|
|
|
|
def init_model(filename, target_platform="rk3588"):
    """Load an RKNN model file and initialise its runtime on NPU core 0.

    Args:
      filename: Path to the ``.rknn`` model file.
      target_platform: Currently unused by this function; kept so callers
        that pass it keep working.

    Returns:
      The ``RKNNLite`` instance after ``load_rknn`` and ``init_runtime``
      both returned 0.

    Raises:
      SystemExit: If ``filename`` is not an existing file, or if loading the
        model or initialising the runtime returns a non-zero code. The exit
        message describes which step failed.
    """
    # The builtin ``exit`` is injected by the ``site`` module for interactive
    # use and is missing under ``python -S`` or in frozen builds. ``sys.exit``
    # raises the same SystemExit and is always available.
    import sys

    if not Path(filename).is_file():
        sys.exit(f"{filename} does not exist")

    rknn_lite = RKNNLite(verbose=False)

    if rknn_lite.load_rknn(path=filename) != 0:
        sys.exit(f"Load model {filename} failed!")

    if rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_0) != 0:
        sys.exit(f"Failed to init rknn runtime for {filename}")

    return rknn_lite
|
|
|
|
|
|
class RKNNModel:
    """Callable wrapper around an initialised ``RKNNLite`` model.

    The wrapped runtime object is stored in ``self.model``. Call
    ``release()`` when the model is no longer needed.
    """

    def __init__(self, model: str, target_platform="rk3588"):
        """Load the model file and initialise the runtime via ``init_model``.

        Args:
          model: Path to the ``.rknn`` model file.
          target_platform: Forwarded to ``init_model``.
        """
        # Forward target_platform instead of silently dropping it, so the
        # value the caller passes is the one init_model receives.
        self.model = init_model(model, target_platform=target_platform)

    def release(self):
        """Release the wrapped runtime by calling its ``release()`` method."""
        self.model.release()

    def __call__(self, x: np.ndarray, h: np.ndarray, c: np.ndarray):
        """Run one inference step.

        Args:
          x: (1, 512), np.float32
          h: (2, 1, 64), np.float32
          c: (2, 1, 64), np.float32

        Returns:
          A tuple ``(prob, next_h, next_c)``:
            prob: The first model output converted to a Python scalar with
              ``.item()``; that output must therefore hold exactly one
              element.
            next_h: The second model output, returned unchanged.
            next_c: The third model output, returned unchanged.
        """
        out, next_h, next_c = self.model.inference(inputs=[x, h, c])
        return out.item(), next_h, next_c
|
|
|
|
|
|
def main():
    """Load ``./m.rknn``, run ``test`` on it once, then release the model."""
    model = RKNNModel(model="./m.rknn")
    try:
        # A single pass; the range bound is the number of runs.
        for _ in range(1):
            test(model)
    finally:
        # Always release the runtime, even if the test raises or exits.
        model.release()
|
|
|
|
|
|
def _collect_speech_segments(is_speech, min_speech_windows):
    """Return ``(start, end)`` window-index pairs for runs of speech windows.

    ``end`` is exclusive (the index of the first window after the run). Runs
    whose length is not greater than ``min_speech_windows`` are dropped.
    """
    segments = []
    run_start = -1  # -1 means "not currently inside a speech run"
    for k, flag in enumerate(is_speech):
        if flag:
            if run_start == -1:
                run_start = k
        elif run_start != -1:
            if k - run_start > min_speech_windows:
                segments.append((run_start, k))
            run_start = -1

    # A run that is still open at the end of the audio is closed at
    # len(is_speech), so its end is exclusive like the runs closed above.
    # Using the length also keeps this safe when is_speech is empty.
    total = len(is_speech)
    if run_start != -1 and total - run_start > min_speech_windows:
        segments.append((run_start, total))
    return segments


def _merge_close_segments(segments, min_silence_windows):
    """Merge consecutive segments separated by less than ``min_silence_windows``.

    ``segments`` must be non-empty and ordered by start index.
    """
    merged = [segments[0]]
    for seg in segments[1:]:
        prev = merged[-1]
        if seg[0] - prev[1] < min_silence_windows:
            merged[-1] = (prev[0], seg[1])
        else:
            merged.append(seg)
    return merged


def test(model, wav_filename="./lei-jun-test.wav"):
    """Run the model over a wav file window by window and print speech segments.

    The audio is split into non-overlapping windows of 512 samples; leftover
    samples at the end that do not fill a window are ignored. Each window is
    fed to ``model`` together with the recurrent states ``h`` and ``c``, and
    a window counts as speech when the returned probability exceeds 0.5.
    Speech runs longer than 0.25 s are kept, runs separated by less than
    0.25 s are merged, and the result is printed as ``start -- end`` in
    seconds.

    Args:
      model: Callable taking ``(x, h, c)`` and returning
        ``(prob, next_h, next_c)``, e.g. an ``RKNNModel``.
      wav_filename: Audio file to process. It must have a 16 kHz sample rate.

    Raises:
      AssertionError: If the sample rate of the file is not 16000.
    """
    print("started")
    samples, sample_rate = load_audio(wav_filename)
    assert sample_rate == 16000, sample_rate

    window_size = 512
    threshold = 0.5

    # Recurrent states, zero-initialised and threaded through every call.
    h = np.zeros((2, 1, 64), dtype=np.float32)
    c = np.zeros((2, 1, 64), dtype=np.float32)

    num_windows = samples.shape[0] // window_size
    is_speech = []
    for i in range(num_windows):
        print(i, num_windows)
        window = samples[i * window_size : (i + 1) * window_size]
        # window[None] adds the leading axis, giving shape (1, window_size).
        prob, h, c = model(window[None], h, c)
        is_speech.append(prob > threshold)

    # 0.25 seconds expressed as a (fractional) number of windows.
    min_speech_duration = 0.25 * sample_rate / window_size
    min_silence_duration = 0.25 * sample_rate / window_size

    result = _collect_speech_segments(is_speech, min_speech_duration)
    if not result:
        print(f"Empty for {wav_filename}")
        return

    print(result)

    for seg_start, seg_end in _merge_close_segments(result, min_silence_duration):
        start_sec = seg_start * window_size / sample_rate
        end_sec = seg_end * window_size / sample_rate
        print("{:.3f} -- {:.3f}".format(start_sec, end_sec))
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|