// include/snowboy-detect.h
// Copyright 2016 KITT.AI (author: Guoguo Chen)
#ifndef SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
#define SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_

#include <cstdint>
#include <memory>
#include <string>

namespace snowboy {

// Forward declarations. These types are only referenced through
// std::unique_ptr members below, so this header does not need their
// definitions.
struct WaveHeader;
class PipelineDetect;
class PipelineVad;

////////////////////////////////////////////////////////////////////////////////
//
// SnowboyDetect class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyDetect {
 public:
  // Constructor that takes a resource file, and a list of hotword models which
  // are separated by comma. In the case that more than one hotword exists in
  // the provided models, RunDetection() will return the index of the hotword,
  // if the corresponding hotword is triggered.
  //
  // CAVEAT: a personal model only contains one hotword, but a universal model
  //         may contain multiple hotwords. It is your responsibility to figure
  //         out the index of the hotword. For example, if your model string is
  //         "foo.pmdl,bar.umdl", where foo.pmdl contains hotword x, bar.umdl
  //         has two hotwords y and z, the indices of different hotwords are as
  //         follows:
  //         x 1
  //         y 2
  //         z 3
  //
  // @param [in]  resource_filename   Filename of resource file.
  // @param [in]  model_str           A string of multiple hotword models,
  //                                  separated by comma.
  SnowboyDetect(const std::string& resource_filename,
                const std::string& model_str);

  // Not copyable: the std::unique_ptr members below already make the class
  // non-copyable; deleting the operations here just makes that explicit.
  SnowboyDetect(const SnowboyDetect&) = delete;
  SnowboyDetect& operator=(const SnowboyDetect&) = delete;

  // Resets the detection. This class handles voice activity detection (VAD)
  // internally. But if you have an external VAD, you should call Reset()
  // whenever you see a segment end from your VAD.
  bool Reset();

  // Runs hotword detection. Supported audio format is WAVE (with linear PCM,
  // 8-bits unsigned integer, 16-bits signed integer or 32-bits signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunDetection(). A larger chunk usually leads to longer delay, but less
  // CPU usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: No event.
  //  1: Hotword 1 triggered.
  //  2: Hotword 2 triggered.
  //  ...
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const std::string& data, bool is_end = false);

  // Various versions of RunDetection() that take different formats of audio.
  // If NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out
  // as follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const float* const data,
                   const int array_length, bool is_end = false);
  int RunDetection(const std::int16_t* const data,
                   const int array_length, bool is_end = false);
  int RunDetection(const std::int32_t* const data,
                   const int array_length, bool is_end = false);

  // Sets the sensitivity string for the loaded hotwords. A <sensitivity_str> is
  // a list of floating numbers between 0 and 1, separated by comma. For
  // example, if there are 3 loaded hotwords, your string should look something
  // like this:
  //   0.4,0.5,0.8
  // Make sure you properly align each sensitivity value with the corresponding
  // hotword.
  void SetSensitivity(const std::string& sensitivity_str);

  // Similar to the sensitivity setting above. When set higher than the above
  // sensitivity, the algorithm automatically chooses between the normal
  // sensitivity set above and the higher sensitivity set here, to maximize the
  // performance. By default, it is not set, which means the algorithm will
  // stick with the sensitivity set above.
  void SetHighSensitivity(const std::string& high_sensitivity_str);

  // Returns the sensitivity string for the current hotwords.
  std::string GetSensitivity() const;

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost the input audio level.
  void SetAudioGain(const float audio_gain);

  // Writes the models to the model filenames specified in <model_str> in the
  // constructor. This overwrites the original model with the latest parameter
  // setting. You are supposed to call this function if you have updated the
  // hotword sensitivities through SetSensitivity(), and you would like to store
  // those values in the model as the default value.
  void UpdateModel() const;

  // Returns the number of the loaded hotwords. This helps you to figure out
  // the index of the hotwords.
  int NumHotwords() const;

  // If <apply_frontend> is true, then applies frontend audio processing;
  // otherwise turns the audio processing off. Frontend audio processing
  // includes algorithms such as automatic gain control (AGC), noise suppression
  // (NS) and so on. Generally adding frontend audio processing helps the
  // performance, but if the model is not trained with frontend audio
  // processing, it may decrease the performance. The general rule of thumb is:
  //   1. For personal models, set it to false.
  //   2. For universal models, follow the instruction of each published model.
  void ApplyFrontend(const bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  // Declared here and defined out of line: the std::unique_ptr members point
  // to types that are incomplete in this header.
  ~SnowboyDetect();

 private:
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineDetect> detect_pipeline_;
};

////////////////////////////////////////////////////////////////////////////////
//
// SnowboyVad class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyVad {
 public:
  // Constructor that takes a resource file. It shares the same resource file
  // with SnowboyDetect.
  //
  // @param [in]  resource_filename   Filename of resource file.
  SnowboyVad(const std::string& resource_filename);

  // Not copyable: the std::unique_ptr members below already make the class
  // non-copyable; deleting the operations here just makes that explicit.
  SnowboyVad(const SnowboyVad&) = delete;
  SnowboyVad& operator=(const SnowboyVad&) = delete;

  // Resets the VAD.
  bool Reset();

  // Runs the VAD algorithm. Supported audio format is WAVE (with linear PCM,
  // 8-bits unsigned integer, 16-bits signed integer or 32-bits signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunVad(). A larger chunk usually leads to longer delay, but less CPU
  // usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: Non-silence.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const std::string& data, bool is_end = false);

  // Various versions of RunVad() that take different formats of audio. If
  // NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out as
  // follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const float* const data,
             const int array_length, bool is_end = false);
  int RunVad(const std::int16_t* const data,
             const int array_length, bool is_end = false);
  int RunVad(const std::int32_t* const data,
             const int array_length, bool is_end = false);

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost the input audio level.
  void SetAudioGain(const float audio_gain);

  // If <apply_frontend> is true, then applies frontend audio processing;
  // otherwise turns the audio processing off.
  void ApplyFrontend(const bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  // Declared here and defined out of line: the std::unique_ptr members point
  // to types that are incomplete in this header.
  ~SnowboyVad();

 private:
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineVad> vad_pipeline_;
};

}  // namespace snowboy
#endif  // SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_