17.2. STT(Speech To Text) 语音转文本

17.2.1. SpeechRecognition

https://github.com/Uberi/speech_recognition

安装

pip install SpeechRecognition


brew install portaudio
pip install pyaudio


neo@MacBook-Pro-Neo ~ % python3 -m speech_recognition

查看麦克风列表

import speech_recognition as sr

# Print each microphone name together with the device_index to pass to sr.Microphone.
names = sr.Microphone.list_microphone_names()
template = "Microphone with name \"{1}\" found for `Microphone(device_index={0})`"
for index, name in enumerate(names):
    print(template.format(index, name))


neo@MacBook-Pro-Neo ~/workspace/python/speech % python3 microphone.py
Microphone with name "Built-in Microphone" found for `Microphone(device_index=0)`
Microphone with name "Built-in Output" found for `Microphone(device_index=1)`			


import speech_recognition as sr
print(sr.__version__)  # optional: only prints the installed library version
r = sr.Recognizer()
# The device index is machine-specific (1 on the author's machine); use the index of your own microphone.
mic = sr.Microphone(device_index=1)


import speech_recognition as sr
print(sr.__version__)  # optional: only prints the installed library version
r = sr.Recognizer()
# Device index 1 is the author's device; replace it with the index of your own microphone.
with sr.Microphone(device_index=1) as source:
    print("Say now!!!!")
    r.adjust_for_ambient_noise(source)  # reduce noise
    audio = r.listen(source)  # capture voice input from the microphone
print(r.recognize_google(audio)) #to print voice into text

PocketSphinx 文件转文本

PocketSphinx默认仅支持英文识别,中文需要下载语言模型文件,Mandarin 为中文普通话。

brew install swig
brew install pocketsphinx
pip install PocketSphinx			


import speech_recognition as sr

# Obtain audio from the file.
recognizer = sr.Recognizer()
audioFile = sr.AudioFile(r"english.wav")
with audioFile as source:
    audio = recognizer.record(source)

# Recognize speech using Sphinx; both failure modes are reported instead of crashing.
try:
    print("Sphinx thinks you said: " + recognizer.recognize_sphinx(audio))
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))



#!/usr/bin/env python3

import speech_recognition as sr


# Print each microphone name together with the device_index to pass to sr.Microphone.
names = sr.Microphone.list_microphone_names()
template = "Microphone with name \"{1}\" found for `Microphone(device_index={0})`"
for index, name in enumerate(names):
    print(template.format(index, name))

# obtain audio from the microphone
r = sr.Recognizer()
with sr.Microphone() as source:
    print("Say something!")
    audio = r.listen(source)

# recognize speech using Sphinx
    print("Sphinx thinks you said: " + r.recognize_sphinx(audio))
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))    			 Google Cloud Speech API


import speech_recognition as sr
# Obtain audio from the microphone.
r = sr.Recognizer()
with sr.Microphone() as source:
    print("Say something!")
    audio = r.listen(source)

# Recognize speech using Google Speech Recognition; both failure modes are reported instead of crashing.
try:
    text = r.recognize_google(audio)
    print("You said: " + text)
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    # Separate the fixed message from the error detail so the output stays readable.
    print("Could not request results from Google Speech Recognition service; {0}".format(e))


text = r.recognize_google(audio, language='zh-CN', show_all= True)	
text = r.recognize_google(audio_data, language="es-ES")

IBM Speech to Text

使用 IBM 的服务需要一个 IBM Cloud 云账号,如果你还没有,请先注册一个账号,然后创建 Speech To Text 服务。

测试 Speech to Text 是否正常工作

neo@MacBook-Pro-Neo ~/workspace/python/speech % wget https://watson-developer-cloud.github.io/doc-tutorial-downloads/speech-to-text/audio-file.flac	

neo@MacBook-Pro-Neo ~/workspace/python/speech % curl -X POST -u "apikey:{apikey}" \
--header "Content-Type: audio/flac" \
--data-binary @audio-file.flac \
"{url}/v1/recognize"

#!/usr/bin/env python3

import speech_recognition as sr
import ssl

# WARNING: this swaps the default HTTPS context factory for the unverified one,
# which turns off TLS certificate verification for code relying on that default.
# Only acceptable for a local experiment; do not ship this.
ssl._create_default_https_context = ssl._create_unverified_context

# Build the path to a sample recording in the same folder as this script.
from os import path
# Alternative sample recordings (uncomment one of these instead of the active line):
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav")
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "french.aiff")
AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "chinese.flac")

# Use the audio file as the audio source.
r = sr.Recognizer()
with sr.AudioFile(AUDIO_FILE) as source:
    audio = r.record(source)  # read the entire audio file

# Recognize speech using IBM Speech to Text; both failure modes are reported instead of crashing.
# NOTE(review): AUDIO_FILE is a Chinese recording but no `language` argument is passed —
# confirm the service default matches the recording's language.
try:
    print("IBM Speech to Text thinks you said " + r.recognize_ibm(audio, username="netkiller@msn.com", password="******"))
except sr.UnknownValueError:
    print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
    print("Could not request results from IBM Speech to Text service; {0}".format(e))

17.2.2. DeepSpeech


# Quick start: install DeepSpeech, fetch the v0.9.3 model files and sample audio,
# then transcribe one of the sample recordings.

# Install DeepSpeech
pip3 install deepspeech

# Download pre-trained English model files (passed below as --model and --scorer);
# -L follows redirects, -O saves each file under its remote name in the current directory
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

# Download example audio files and unpack them (the command below reads from audio/)
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/audio-0.9.3.tar.gz
tar xvf audio-0.9.3.tar.gz

# Transcribe an audio file
deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/2830-3980-0043.wav