You've all heard of neural nets, no? One of the "hello world"s of neural nets is OCR with the MNIST dataset. My idea for speech recognition is to train the neural net with short labeled spectrograms instead of characters. Unfortunately I don't know of a database for doing this, so I have to create my own software for doing so. Any suggestions on how to improve it?
Note: Ultimately this is meant for real-time speech recognition, but in testing I could not plot and write the spectrograms to file quickly enough for that to work, since doing so interferes with how often the CPU can sample audio from the microphone. I would be grateful for any suggestions on how to improve performance in this area.
main.py:
from record import AudioHandler
import sys
import shutil
import os


if __name__ == '__main__':
    audio = AudioHandler()
    if os.path.isdir(audio.DATA_DIR):
        shutil.rmtree(audio.DATA_DIR)  # start each session with a clean folder

    # listen() returns a truthy value once a full utterance (speech followed
    # by enough silence) has been captured; Ctrl-C stops the session early.
    done = False
    while not done:
        try:
            done = audio.listen()
        except KeyboardInterrupt:
            break

    audio.stop()  # release the microphone stream before post-processing
    if audio.audio:  # guard: with no recorded blocks, np.hstack([]) raises
        audio.convert_fileblock()
        audio.save_all_audio()
    sys.exit()
record.py:
import pyaudio
import struct
import math
import time
import numpy as np
import os
import glob
import scipy.io.wavfile
from scipy import signal
import matplotlib.pyplot as plt
def get_rms(block):
    """Return the root-mean-square amplitude of an audio block.

    Parameters
    ----------
    block : array_like
        Sample values, e.g. int16 PCM samples read from the microphone.

    Returns
    -------
    float
        sqrt(mean(block**2)).
    """
    # Cast to float64 before squaring: squaring an int16 array wraps around
    # (e.g. 200**2 = 40000 > 32767), which silently corrupted the RMS value
    # compared against THRESHOLD. The old try/except also swallowed errors
    # and then returned an unbound `rms` (UnboundLocalError); letting any
    # exception propagate is clearer. np.abs before squaring was redundant.
    samples = np.asarray(block, dtype=np.float64)
    return float(np.sqrt(np.mean(np.square(samples))))
class AudioHandler(object):
    """Record microphone audio in 30 ms blocks, segment it into utterances
    by RMS-based silence detection, and dump each utterance's blocks (plus
    per-block spectrograms and a whole-session WAV) under DATA_DIR.
    """

    def __init__(self):
        self.DATA_DIR = 'raw_data'
        self.RATE = 16000                  # sample rate, Hz
        self.INPUT_BLOCK_TIME = 0.03       # seconds per block (30 ms)
        self.CHANNELS = 1
        self.INPUT_FRAMES_PER_BLOCK = int(self.RATE * self.INPUT_BLOCK_TIME)
        self.SENTENCE_DELAY = 1.1          # seconds of silence that end an utterance
        self.MAX_SILENT_BLOCKS = math.ceil(self.SENTENCE_DELAY / self.INPUT_BLOCK_TIME)
        self.THRESHOLD = 40                # RMS level separating speech from silence
        self.pa = pyaudio.PyAudio()
        self.stream = self.open_mic_stream()
        self.save_counter = 0              # index of the next blockNNNNNNNN.txt file
        self.silent_blocks = 0             # consecutive below-threshold blocks seen
        self.listening = False             # True while inside an utterance
        self.audio = []                    # every kept block, in order, for ALL.wav

    def stop(self):
        """Release the input stream and the PortAudio session."""
        self.stream.close()
        self.pa.terminate()  # previously leaked: PyAudio was never terminated

    def find_input_device(self):
        """Return the index of the first device whose name looks like a
        microphone, or None so PyAudio falls back to the default input."""
        for index in range(self.pa.get_device_count()):
            devinfo = self.pa.get_device_info_by_index(index)
            name = devinfo['name'].lower()
            if any(keyword in name for keyword in ('mic', 'input')):
                print('Found an input: Device {} - {}'.format(index, devinfo['name']))
                return index
        print('No preferred input found; using default input device.')
        return None

    def open_mic_stream(self):
        """Open and return a 16-bit mono input stream at self.RATE."""
        device_index = self.find_input_device()
        stream = self.pa.open(format=pyaudio.paInt16,
                              channels=self.CHANNELS,
                              rate=self.RATE,
                              input=True,
                              input_device_index=device_index,
                              frames_per_buffer=self.INPUT_FRAMES_PER_BLOCK)
        return stream

    def save_block(self, snd_block):
        """Append one block to the in-memory recording and persist it as text.

        NOTE(perf): np.savetxt is human-readable but slow and large on disk;
        np.save (.npy binary) would cut the per-block write time considerably.
        """
        self.audio.append(snd_block)
        if not os.path.isdir(self.DATA_DIR):
            os.makedirs(self.DATA_DIR)
        np.savetxt('{}/block{:08d}.txt'.format(self.DATA_DIR, self.save_counter),
                   snd_block)
        self.save_counter += 1

    def listen(self):
        """Capture one block and update the utterance state machine.

        Returns
        -------
        True once an utterance has ended (speech followed by more than
        MAX_SILENT_BLOCKS of silence); None otherwise, including when the
        read fails.
        """
        try:
            raw_block = self.stream.read(self.INPUT_FRAMES_PER_BLOCK,
                                         exception_on_overflow=False)
            # np.fromstring is deprecated for binary input; frombuffer is
            # the (zero-copy) replacement.
            snd_block = np.frombuffer(raw_block, dtype=np.int16)
        except Exception as e:
            print('Error recording: {}'.format(e))
            return None

        if get_rms(snd_block) > self.THRESHOLD:
            self.listening = True
            self.silent_blocks = 0  # speech resets the silence run
        else:
            self.silent_blocks += 1

        if self.listening:
            self.save_block(snd_block)

        if self.listening and self.silent_blocks > self.MAX_SILENT_BLOCKS:
            self._trim_trailing_silence()
            self.listening = False
            return True  # done speaking
        return None

    def _trim_trailing_silence(self):
        """Discard the run of silent blocks saved at the tail of an utterance.

        Every block counted by silent_blocks was also saved, so exactly that
        many must be removed. (The old range-based loop removed only
        MAX_SILENT_BLOCKS - 1 blocks — off by one at both ends — and never
        reset silent_blocks.) save_counter is decremented in step so block
        numbering stays contiguous for the next utterance.
        """
        for _ in range(min(self.silent_blocks, len(self.audio))):
            self.audio.pop()
            self.save_counter -= 1
            os.remove('{}/block{:08d}.txt'.format(self.DATA_DIR, self.save_counter))
        self.silent_blocks = 0

    def save_all_audio(self):
        """Write the whole session as ALL.wav plus one overview spectrogram."""
        flat_audio = np.hstack(self.audio)
        scipy.io.wavfile.write('{}/ALL.wav'.format(self.DATA_DIR), self.RATE, flat_audio)
        # 'hann' is the canonical window name; the 'hanning' alias was
        # removed from newer SciPy releases.
        f, t, Sxx = signal.spectrogram(flat_audio, fs=self.RATE,
                                       window='hann', scaling='spectrum')
        mesh = plt.pcolormesh(t, f, 10 * np.log10(1 + Sxx), cmap='gray')
        mesh.axes.get_xaxis().set_visible(False)
        mesh.axes.get_yaxis().set_visible(False)
        plt.savefig('{}/spec_all.png'.format(self.DATA_DIR),
                    bbox_inches='tight', pad_inches=0)
        plt.close()  # free the figure (was leaked before)

    def convert_fileblock(self):
        """Convert each saved block*.txt into audioN.wav and specN.png.

        NOTE(perf): this is the post-processing bottleneck the question asks
        about — one matplotlib figure per 30 ms block is expensive; writing
        Sxx directly as an image (e.g. via scipy/PIL) would avoid matplotlib
        entirely.
        """
        # glob returns files in arbitrary order on some filesystems; sort so
        # audioN/specN numbering agrees with the blockN source files.
        for block_counter, path in enumerate(sorted(glob.glob('{}/*.txt'.format(self.DATA_DIR)))):
            block = np.loadtxt(path, dtype=np.int16)
            t0 = time.time()
            scipy.io.wavfile.write('{}/audio{}.wav'.format(self.DATA_DIR, block_counter),
                                   self.RATE, block)
            f, t, Sxx = signal.spectrogram(block, fs=self.RATE,
                                           window='hann', scaling='spectrum')
            plt.figure(figsize=(self.INPUT_BLOCK_TIME, self.RATE / 4000), dpi=100)
            mesh = plt.pcolormesh(t, f, 10 * np.log10(1 + Sxx), cmap='gray')
            mesh.axes.get_xaxis().set_visible(False)
            mesh.axes.get_yaxis().set_visible(False)
            plt.savefig('{}/spec{}.png'.format(self.DATA_DIR, block_counter),
                        bbox_inches='tight', pad_inches=0)
            plt.close()
            print('Time to process block{}: {}'.format(block_counter, time.time() - t0))
        # A stray bare `sys.exit` used to sit here; it was dead leftover code
        # (and would have raised NameError — `sys` is not imported here).
(Note: the stray `sys.exit` at the end of record.py was left over from earlier handling of the KeyboardInterrupt exception, and lingered there after I restructured the code.)