diff --git a/README.md b/README.md index 74cbb1a..2202a50 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,24 @@ register a project with access to the "Cloud Speech API." See Google's speech-to-text demo site for more information: https://cloud.google.com/speech-to-text/ +### DeepSpeech support (experimental) +vim-speech can also use Mozilla [DeepSpeech](https://github.com/mozilla/DeepSpeech) as an alternative to Google's Cloud Speech API. + +You need to install DeepSpeech: +``` +pip install deepspeech +``` + +Then download or train a DeepSpeech model (and, optionally, a language model scorer), and +tell this plugin where they are located, e.g.: +``` +export DEEPSPEECH_MODEL=/path/to/model.pbmm +export DEEPSPEECH_SCORER=/path/to/model.scorer +``` + +It may be helpful to fine-tune the pre-trained model with your own voice samples. +More information is available in the [DeepSpeech documentation](https://deepspeech.readthedocs.io/). + ## Usage Once you have figured out how to get everything installed, you can use the diff --git a/autoload/vim_speech.vim b/autoload/vim_speech.vim index d103116..6bd42ac 100644 --- a/autoload/vim_speech.vim +++ b/autoload/vim_speech.vim @@ -80,8 +80,8 @@ function! s:StartJobIfNeeded(buffer) abort return endif - if empty($GOOGLE_APPLICATION_CREDENTIALS) - throw 'GOOGLE_APPLICATION_CREDENTIALS is not set' + if empty($GOOGLE_APPLICATION_CREDENTIALS) && empty($DEEPSPEECH_MODEL) + throw 'Neither GOOGLE_APPLICATION_CREDENTIALS nor DEEPSPEECH_MODEL is set' endif let l:command = ale#Escape(g:vim_speech_dir . 
'/venv/bin/python') diff --git a/plugin/speech_to_text_client.py b/plugin/speech_to_text_client.py index 13a4bf7..4098af1 100755 --- a/plugin/speech_to_text_client.py +++ b/plugin/speech_to_text_client.py @@ -92,6 +92,21 @@ def stop_recording(self): return output_file.getvalue() +def transcribe_file_with_deepspeech(content): + from deepspeech import Model + import numpy as np + + if not content: + return '' + + ds = Model(os.environ.get('DEEPSPEECH_MODEL')) + scorer = os.environ.get('DEEPSPEECH_SCORER') + if scorer: + ds.enableExternalScorer(scorer) + numpy_content = np.frombuffer(content, dtype=np.int16) + transcribe = ds.stt(numpy_content) + return transcribe + def transcribe_file(content): from google.cloud import speech @@ -127,11 +142,13 @@ def stdin_has_data(): def main(): + # Stop early if the environment variable isn't set. - if not os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'): + if not os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') and not os.environ.get('DEEPSPEECH_MODEL'): sys.exit( 'You must set GOOGLE_APPLICATION_CREDENTIALS' - ' to your JSON credentials filename.' + ' to your JSON credentials filename or DEEPSPEECH_MODEL' + ' to a trained DeepSpeech model.' ) client = RecordingClient() @@ -156,7 +173,7 @@ def main(): elif message == 'stop': print_and_flush('record end') audio_content = client.stop_recording() - print_and_flush('speech', transcribe_file(audio_content)) + print_and_flush('speech', transcribe_file(audio_content) if os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') else transcribe_file_with_deepspeech(audio_content)) elif message == 'quit': break