// snowboy-detect.h (9.3 KB)
  1. // include/snowboy-detect.h
  2. // Copyright 2016 KITT.AI (author: Guoguo Chen)
  3. #ifndef SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
  4. #define SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
  #include <cstdint>
  #include <memory>
  #include <string>
  namespace snowboy {

  // Forward declarations. These implementation types are only referenced
  // through the std::unique_ptr members declared below, and both classes
  // declare their destructors without defining them here, so this header does
  // not need the full definitions.
  struct WaveHeader;
  class PipelineDetect;
  class PipelineVad;

  //////////////////////////////////////////////////////////////////////////////
  //
  // SnowboyDetect class interface.
  //
  //////////////////////////////////////////////////////////////////////////////
  class SnowboyDetect {
   public:
    // Constructor that takes a resource file and a list of hotword models
    // which are separated by comma. In the case that more than one hotword
    // exists in the provided models, RunDetection() will return the index of
    // the hotword, if the corresponding hotword is triggered.
    //
    // CAVEAT: a personal model only contains one hotword, but a universal
    //         model may contain multiple hotwords. It is your responsibility
    //         to figure out the index of the hotword. For example, if your
    //         model string is "foo.pmdl,bar.umdl", where foo.pmdl contains
    //         hotword x and bar.umdl has two hotwords y and z, the indices of
    //         the different hotwords are as follows:
    //             x 1
    //             y 2
    //             z 3
    //
    // @param [in]  resource_filename   Filename of resource file.
    // @param [in]  model_str           A string of multiple hotword models,
    //                                  separated by comma.
    SnowboyDetect(const std::string& resource_filename,
                  const std::string& model_str);

    // Not copyable. The std::unique_ptr members already rule copying out;
    // deleting the operations explicitly documents that and gives a clearer
    // diagnostic at the call site.
    SnowboyDetect(const SnowboyDetect&) = delete;
    SnowboyDetect& operator=(const SnowboyDetect&) = delete;

    // Resets the detection. This class handles voice activity detection (VAD)
    // internally. But if you have an external VAD, you should call Reset()
    // whenever you see a segment end from your VAD.
    // NOTE(review): the meaning of the returned bool is not stated in this
    // header -- presumably true on success; confirm against the
    // implementation.
    bool Reset();

    // Runs hotword detection. Supported audio format is WAVE (with linear
    // PCM, 8-bits unsigned integer, 16-bits signed integer or 32-bits signed
    // integer). See SampleRate(), NumChannels() and BitsPerSample() for the
    // required sampling rate, number of channels and bits per sample values.
    // You are supposed to provide a small chunk of data (e.g., 0.1 second)
    // each time you call RunDetection(). A larger chunk usually leads to
    // longer delay, but less CPU usage.
    //
    // Definition of return values:
    //   -2: Silence.
    //   -1: Error.
    //    0: No event.
    //    1: Hotword 1 triggered.
    //    2: Hotword 2 triggered.
    //    ...
    //
    // @param [in]  data     Small chunk of data to be detected. See above for
    //                       the supported data format.
    // @param [in]  is_end   Set it to true if it is the end of an utterance
    //                       or file.
    int RunDetection(const std::string& data, bool is_end = false);

    // Various versions of RunDetection() that take different formats of
    // audio. If NumChannels() > 1, e.g., NumChannels() == 2, then the array
    // is laid out as follows:
    //
    //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
    //
    // where d1c1 means data point 1 of channel 1.
    //
    // @param [in]  data           Small chunk of data to be detected. See
    //                             above for the supported data format.
    // @param [in]  array_length   Length of the data array.
    // @param [in]  is_end         Set it to true if it is the end of an
    //                             utterance or file.
    int RunDetection(const float* const data,
                     const int array_length, bool is_end = false);
    int RunDetection(const int16_t* const data,
                     const int array_length, bool is_end = false);
    int RunDetection(const int32_t* const data,
                     const int array_length, bool is_end = false);

    // Sets the sensitivity string for the loaded hotwords. A
    // <sensitivity_str> is a list of floating-point numbers between 0 and 1,
    // separated by comma. For example, if there are 3 loaded hotwords, your
    // string should look something like this:
    //   0.4,0.5,0.8
    // Make sure you properly align each sensitivity value with the
    // corresponding hotword.
    void SetSensitivity(const std::string& sensitivity_str);

    // Similar to the sensitivity setting above. When set higher than the
    // above sensitivity, the algorithm automatically chooses between the
    // normal sensitivity set above and the higher sensitivity set here, to
    // maximize the performance. By default, it is not set, which means the
    // algorithm will stick with the sensitivity set above.
    void SetHighSensitivity(const std::string& high_sensitivity_str);

    // Returns the sensitivity string for the current hotwords.
    std::string GetSensitivity() const;

    // Applies a fixed gain to the input audio. In case you have a very weak
    // microphone, you can use this function to boost the input audio level.
    void SetAudioGain(const float audio_gain);

    // Writes the models to the model filenames specified in <model_str> in
    // the constructor. This overwrites the original model with the latest
    // parameter setting. You are supposed to call this function if you have
    // updated the hotword sensitivities through SetSensitivity(), and you
    // would like to store those values in the model as the default value.
    void UpdateModel() const;

    // Returns the number of the loaded hotwords. This helps you to figure out
    // the index of the hotwords.
    int NumHotwords() const;

    // If <apply_frontend> is true, then apply frontend audio processing;
    // otherwise turn the audio processing off. Frontend audio processing
    // includes algorithms such as automatic gain control (AGC), noise
    // suppression (NS) and so on. Generally adding frontend audio processing
    // helps the performance, but if the model is not trained with frontend
    // audio processing, it may decrease the performance. The general rule of
    // thumb is:
    //   1. For personal models, set it to false.
    //   2. For universal models, follow the instruction of each published
    //      model.
    void ApplyFrontend(const bool apply_frontend);

    // Returns the required sampling rate, number of channels and bits per
    // sample values for the audio data. You should use this information to
    // set up your audio capturing interface.
    int SampleRate() const;
    int NumChannels() const;
    int BitsPerSample() const;

    // Declared here and defined out of line, because the unique_ptr members
    // below point at types that are incomplete in this header.
    ~SnowboyDetect();

   private:
    std::unique_ptr<WaveHeader> wave_header_;
    std::unique_ptr<PipelineDetect> detect_pipeline_;
  };

  //////////////////////////////////////////////////////////////////////////////
  //
  // SnowboyVad class interface.
  //
  //////////////////////////////////////////////////////////////////////////////
  class SnowboyVad {
   public:
    // Constructor that takes a resource file. It shares the same resource
    // file with SnowboyDetect. Marked explicit so that a std::string is never
    // silently converted into a VAD object.
    //
    // @param [in]  resource_filename   Filename of resource file.
    explicit SnowboyVad(const std::string& resource_filename);

    // Not copyable. The std::unique_ptr members already rule copying out;
    // deleting the operations explicitly documents that and gives a clearer
    // diagnostic at the call site.
    SnowboyVad(const SnowboyVad&) = delete;
    SnowboyVad& operator=(const SnowboyVad&) = delete;

    // Resets the VAD.
    // NOTE(review): the meaning of the returned bool is not stated in this
    // header -- presumably true on success; confirm against the
    // implementation.
    bool Reset();

    // Runs the VAD algorithm. Supported audio format is WAVE (with linear
    // PCM, 8-bits unsigned integer, 16-bits signed integer or 32-bits signed
    // integer). See SampleRate(), NumChannels() and BitsPerSample() for the
    // required sampling rate, number of channels and bits per sample values.
    // You are supposed to provide a small chunk of data (e.g., 0.1 second)
    // each time you call RunVad(). A larger chunk usually leads to longer
    // delay, but less CPU usage.
    //
    // Definition of return values:
    //   -2: Silence.
    //   -1: Error.
    //    0: Non-silence.
    //
    // @param [in]  data     Small chunk of data to be detected. See above for
    //                       the supported data format.
    // @param [in]  is_end   Set it to true if it is the end of an utterance
    //                       or file.
    int RunVad(const std::string& data, bool is_end = false);

    // Various versions of RunVad() that take different formats of audio. If
    // NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out
    // as follows:
    //
    //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
    //
    // where d1c1 means data point 1 of channel 1.
    //
    // @param [in]  data           Small chunk of data to be detected. See
    //                             above for the supported data format.
    // @param [in]  array_length   Length of the data array.
    // @param [in]  is_end         Set it to true if it is the end of an
    //                             utterance or file.
    int RunVad(const float* const data,
               const int array_length, bool is_end = false);
    int RunVad(const int16_t* const data,
               const int array_length, bool is_end = false);
    int RunVad(const int32_t* const data,
               const int array_length, bool is_end = false);

    // Applies a fixed gain to the input audio. In case you have a very weak
    // microphone, you can use this function to boost the input audio level.
    void SetAudioGain(const float audio_gain);

    // If <apply_frontend> is true, then apply frontend audio processing;
    // otherwise turn the audio processing off.
    void ApplyFrontend(const bool apply_frontend);

    // Returns the required sampling rate, number of channels and bits per
    // sample values for the audio data. You should use this information to
    // set up your audio capturing interface.
    int SampleRate() const;
    int NumChannels() const;
    int BitsPerSample() const;

    // Declared here and defined out of line, because the unique_ptr members
    // below point at types that are incomplete in this header.
    ~SnowboyVad();

   private:
    std::unique_ptr<WaveHeader> wave_header_;
    std::unique_ptr<PipelineVad> vad_pipeline_;
  };

  }  // namespace snowboy
  196. #endif // SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_