Track token scores (#571)

* add export of per-token scores (ys, lm, context)

- for best path of the modified-beam-search decoding of transducer

* refactoring JSON export of OnlineRecognitionResult, extending pybind11 API of OnlineRecognitionResult

* export per-token scores also for greedy-search (online-transducer)

- export unscaled lm_probs (modified-beam-search, online-transducer)
- polishing

* fill lm_probs/context_scores only if LM/ContextGraph is present (make Result smaller)
This commit is contained in:
Karel Vesely
2024-02-28 23:28:45 +01:00
committed by GitHub
parent 85d59b5840
commit 38c072dcb2
11 changed files with 155 additions and 49 deletions

View File

@@ -40,6 +40,12 @@ struct OnlineRecognizerResult {
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
std::vector<float> timestamps;
std::vector<float> ys_probs; ///< log-prob scores from ASR model
std::vector<float> lm_probs; ///< log-prob scores from language model
//
/// log-domain scores from "hot-phrase" contextual boosting
std::vector<float> context_scores;
/// ID of this segment
/// When an endpoint is detected, it is incremented
int32_t segment = 0;
@@ -58,6 +64,9 @@ struct OnlineRecognizerResult {
* "text": "The recognition result",
* "tokens": [x, x, x],
* "timestamps": [x, x, x],
* "ys_probs": [x, x, x],
* "lm_probs": [x, x, x],
* "context_scores": [x, x, x],
* "segment": x,
* "start_time": x,
* "is_final": true|false