Speech Recognition Part 1: Generate Training Data

Question

You've all heard of neural nets, no? One of the "hello world"s of neural nets is OCR with the MNIST dataset. My idea for speech recognition is to train the neural net with short labeled spectrograms instead of characters. Unfortunately I don't know of a database for doing this, so I have to create my own software for doing so. Any suggestions on how to improve it?

Note: Ultimately this is meant for real-time speech recognition, but when tested I couldn't plot and write spectrograms to file quickly enough for this to happen (as it interferes with how often the CPU can sample audio from the microphone). I would be grateful for any solutions detailing how to improve performance in this area.

main.py:

from record import AudioHandler
import sys
import shutil
import os

if __name__ == '__main__':    
    invalid = 0
    audio = AudioHandler()
    if os.path.isdir(audio.DATA_DIR):
        shutil.rmtree(audio.DATA_DIR) # clean folder
    while not invalid:
        try:
            invalid = audio.listen()
        except KeyboardInterrupt:
            break

    audio.convert_fileblock()
    audio.save_all_audio()
    sys.exit()

record.py:

import pyaudio
import struct
import math
import time
import numpy as np
import os
import glob
import scipy.io.wavfile
from scipy import signal
import matplotlib.pyplot as plt

def get_rms(block):
    try:
        rms = np.sqrt(np.mean(np.square(np.abs(block))))
    except Exception as e:
        print('RMS error: {}'.format(e))

    return rms

class AudioHandler(object):
    def __init__(self):
        self.DATA_DIR = 'raw_data'
        self.RATE = 16000
        self.INPUT_BLOCK_TIME = 0.03 # 30 ms
        self.CHANNELS = 1
        self.INPUT_FRAMES_PER_BLOCK = int(self.RATE * self.INPUT_BLOCK_TIME)
        self.SENTENCE_DELAY = 1.1 # seconds
        self.MAX_SILENT_BLOCKS = math.ceil(self.SENTENCE_DELAY / self.INPUT_BLOCK_TIME)
        self.THRESHOLD = 40 # dB
        self.pa = pyaudio.PyAudio()
        self.stream = self.open_mic_stream()
        self.save_counter = '0'.zfill(8)
        self.silent_blocks = 0
        self.listening = False
        self.audio = []

    def stop(self):
        self.stream.close()

    def find_input_device(self):
        device_index = None
        for i in range( self.pa.get_device_count() ):
            devinfo = self.pa.get_device_info_by_index(i)

            for keyword in ['mic','input']:
                if keyword in devinfo['name'].lower():
                    print('Found an input: Device {} - {}'.format(i, devinfo['name']))
                    device_index = i
                    return device_index

        if device_index == None:
            print('No preferred input found; using default input device.')

        return device_index

    def open_mic_stream(self):
        device_index = self.find_input_device()

        stream = self.pa.open(  format = pyaudio.paInt16,
                                channels = self.CHANNELS,
                                rate = self.RATE,
                                input = True,
                                input_device_index = device_index,
                                frames_per_buffer = self.INPUT_FRAMES_PER_BLOCK)

        return stream

    def save_block(self, snd_block):
        self.audio.append(snd_block)
        flat_block = np.hstack(snd_block)
        if not os.path.isdir(self.DATA_DIR):
            os.makedirs(self.DATA_DIR)
        np.savetxt('{}/block{}.txt'.format(self.DATA_DIR, self.save_counter), flat_block)
        self.save_counter = str(int(self.save_counter) + 1).zfill(8)

    def listen(self):
        try:
            raw_block = self.stream.read(self.INPUT_FRAMES_PER_BLOCK, False)
            snd_block = np.fromstring(raw_block, dtype=np.int16)
        except Exception as e:
            print('Error recording: {}'.format(e))
            return

        amplitude = get_rms(snd_block)
        if amplitude > self.THRESHOLD:
            self.listening = True
            self.silent_blocks = 0 # reset counter
        else:
            self.silent_blocks += 1

        if self.listening:
            self.save_block(snd_block)
        if self.silent_blocks > self.MAX_SILENT_BLOCKS and self.listening:
            # remove last stored silent blocks
            for i in range(int(self.save_counter) - 1, int(self.save_counter) - self.MAX_SILENT_BLOCKS, -1):
                self.audio.pop()
                i = str(i).zfill(8)
                os.remove('{}/block{}.txt'.format(self.DATA_DIR, i))
            self.listening = False
            return True # done speaking

    def save_all_audio(self):
        flat_audio = np.hstack(self.audio)
        scipy.io.wavfile.write('{}/ALL.wav'.format(self.DATA_DIR), self.RATE, flat_audio)
        f, t, Sxx = signal.spectrogram(flat_audio, fs=self.RATE, window='hanning', scaling='spectrum')
        fig = plt.pcolormesh(t, f, 10 * np.log10(1 + Sxx), cmap='gray')
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)
        plt.savefig('{}/spec_all.png'.format(self.DATA_DIR), bbox_inches='tight', pad_inches = 0)

    def convert_fileblock(self):
        block_counter = 0
        for file in glob.glob('{}/*.txt'.format(self.DATA_DIR)):
            block = np.loadtxt(file, dtype=np.int16)
            t0 = time.time()
            scipy.io.wavfile.write('{}/audio{}.wav'.format(self.DATA_DIR, block_counter), self.RATE, block)
            f, t, Sxx = signal.spectrogram(block, fs=self.RATE, window='hanning', scaling='spectrum')
            plt.figure(figsize=(self.INPUT_BLOCK_TIME, self.RATE / 4000), dpi = 100)
            fig = plt.pcolormesh(t, f, 10 * np.log10(1 + Sxx), cmap='gray')
            fig.axes.get_xaxis().set_visible(False)
            fig.axes.get_yaxis().set_visible(False)
            plt.savefig('{}/spec{}.png'.format(self.DATA_DIR, block_counter), bbox_inches='tight', pad_inches = 0)
            plt.close()
            print('Time to process block{}: {}'.format(block_counter, time.time() - t0))
            block_counter += 1

@Peilonrayz Initially that line of code was within the KeyboardInterrupt exception, and lingered afterwards when I restructured the code. — syb0rg, Commented Apr 17, 2017 at 2:04
Just as an FYI, you may find the data set available at voxforge.org handy. You should also be aware that most speech recognition systems work on the cepstrum of the sample rather than its spectrum. — Jules, Commented Apr 17, 2017 at 8:38
@Jules Is there a dataset you know of that has pre-labeled spectrograms or cepstrums? Also, can you point to an article saying why a cepstrum would be preferable to a spectrum? I can't seem to find an article giving a reason. — syb0rg, Commented Apr 17, 2017 at 15:36
Can't look at the code in detail right now, but a general comment: Transforming the data in a spectrogram is simply a transformation of what is already there. I'm not an expert on neural networks, but I guess such networks should be able to learn the required transform implicitly with 1D raw audio segments as input instead of 2D spectrograms. In image recognition networks often have 2D convolutional layers at the input, so I'd guess a audio recognition network could use 1D convolutional layers. There is also something called recurrent neural networks that are often used for sequential data. — MB-F, Commented Apr 21, 2017 at 16:12

MSeifert · Accepted Answer · 2017-04-17 13:46:25Z

I have to admit that I have no idea what the purpose of the code is (never worked with speech recognition) but I know a little bit Python so I can give some advise on the code itself.

Performance

You expressed the desire to optimize the performance of your script. But do you know the first rule about optimization? If not, it is:

Don't!

The second rule is:

Don't... yet!

And the third rule is:

Profile before optimizing.

Let me rephrase this: Without tests and profiling you (and I) will likely optimize the wrong part of the code and introduce a bug while doing so.

I personally like pytest and line_profiler but it doesn't matter what tools you choose. Just make sure you know where the bottlenecks are and that the program still works correctly after optimizing the bottlenecks. Before doing that I wouldn't worry about the performance just yet.

Instead, I'll focus on the structure of the code.

Imports:

struct isn't used as far as I can see so why import it?

Also you import several packages so it wouldn't be a bad idea to sort them. I usually sort them alphabetically and seperate builtin modules from 3rd party modules, for example:

# Builtin modules
import glob
import math
import os
import time

# 3rd party modules
import matplotlib.pyplot as plt
import numpy as np
import pyaudio
import scipy.io.wavfile
from scipy import signal

The reason for the seperation is that it makes it easier to know which dependencies need to be installed to actually run the script.

`try` and `except` in functions

Please have a look at your get_rms function: It's essentially a one-line numpy code that is guarded by a try and except.

First, it's great that you only catch Exception instead of having a bare except! But in case you ever get an exception you'll hit an NameError in the return rms line, just because the try didn't work and the rms variable wasn't defined.

Generally, just let the exception bubble up and let the caller figure out what to do when an exception happens. It's a helper function after all. So I would suggest just using:

def get_rms(block):
    return np.sqrt(np.mean(np.square(block)))

I also removed the np.abs because np.square does return an absolute value itself, so the np.abs would be a no-op.

A bit of trivia: If you operate on scalars you can also use math-functions. np.mean returns a scalar, so you could use math.sqrt instead of np.sqrt. I mention this purely as trivia because it's unlikely that the overhead of np.sqrt will contribute significantly to the runtime of your program! But if you ever need to micro-optimize remember that numpy is amazingly fast for arrays but math is faster for scalars.

Class attributes vs. instance attributes

You have a lot of constants in __init__ and it's important to initialize mutable attributes in __init__ (instead of as class attributes) but for immutable attributes it's generally easier to initialize them on the class. That's a matter of preference so this is just a suggestion:

class AudioHandler(object):

    DATA_DIR = 'raw_data'
    RATE = 16000
    INPUT_BLOCK_TIME = 0.03  # 30 ms
    CHANNELS = 1
    INPUT_FRAMES_PER_BLOCK = int(RATE * INPUT_BLOCK_TIME)
    SENTENCE_DELAY = 1.1  # seconds
    MAX_SILENT_BLOCKS = math.ceil(SENTENCE_DELAY / INPUT_BLOCK_TIME)
    THRESHOLD = 40  # dB

    def __init__(self):
        self.pa = pyaudio.PyAudio()
        self.stream = self.open_mic_stream()
        self.save_counter = '0'.zfill(8)
        self.silent_blocks = 0
        self.listening = False
        self.audio = []

One further comment. If you use a context manager approach you might want to put the self.stream = self.open_mic_stream() line in the __enter__ function instead of the __init__!

`str.zfill` or `str.format`

I'm talking about the save_counter attribute. You initialize it as str but you regularly operate on it as int. With str.format you can easily modify the string-representation of an integer, so I would keep the save_counter as int and then format it when you need a string. For example:

>>> '{:0>8}'.format(10001)
'00010001'

This requires some changes throughout your code. For example in save_block you would need to change the last two lines to:

np.savetxt('{}/block{:0>8}.txt'.format(self.DATA_DIR, self.save_counter), flat_block)
self.save_counter += 1

But I feel this is more readable (and probably also a tiny bit faster).

`return` in loops

In find_input_device you can easily get rid of the device variable because either you return inside the loop or the loop terminates without any match:

def find_input_device(self):
    for i in range(self.pa.get_device_count()):
        devinfo = self.pa.get_device_info_by_index(i)

        for keyword in ['mic','input']:
            if keyword in devinfo['name'].lower():
                print('Found an input: Device {} - {}'.format(i, devinfo['name']))
                return i

    print('No preferred input found; using default input device.')
    # return None is implicit!

`enumerate`

In your convert_fileblock method I saw this:

block_counter = 0
for file in glob.glob('{}/*.txt'.format(self.DATA_DIR)):
    # ...
    block_counter += 1

There's a builtin iterator enumerate for this exactly this task:

for block_counter, file in enumerate(glob.glob('{}/*.txt'.format(self.DATA_DIR))):
    # ...

One small issue here: file isn't a good variable name. It's (1) the name of a python2 builtin function and (2) filename would be more descriptive anyway.

`print` vs. `logging`

In most cases you never want to print. Depending on your preferences you should either log, warn or raise. For example in find_input_device the device message is not always important (except when debugging or when you want a verbose output). Instead you could use logging:

logging.debug('Found an input: Device {} - {}'.format(i, devinfo['name']))

By default this message is hidden - except when you do something like:

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

before you run your script.

PEP8 and function calls

This:

    stream = self.pa.open(  format = pyaudio.paInt16,
                            channels = self.CHANNELS,
                            rate = self.RATE,
                            input = True,
                            input_device_index = device_index,
                            frames_per_buffer = self.INPUT_FRAMES_PER_BLOCK)

is breaking two PEP8 "rules":

No whitespaces after (
No whitespaces around = for keyword arguments

so this would look better as:

    stream = self.pa.open(format=pyaudio.paInt16,
                          channels=self.CHANNELS,
                          rate=self.RATE,
                          input=True,
                          input_device_index=device_index,
                          frames_per_buffer=self.INPUT_FRAMES_PER_BLOCK)

and if you care about line-length you can also indent slightly differently:

    stream = self.pa.open(
            format=pyaudio.paInt16,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            input_device_index=device_index,
            frames_per_buffer=self.INPUT_FRAMES_PER_BLOCK)

LBYL and EAFP

The if not os.path.isdir(self.DATA_DIR) in the save_block method is a red flag. It might work most of the time, but what if the directory was created shortly after this call and before the os.makedirs call?

Generally you should just try to create it and catch an exception if it already exists. In newer pythons makedirs supports an exist_okay parameter that simplifies this without using try, except OSError:

os.makedirs(self.DATA_DIR, exist_ok=True)

@Peilonrayz already mentioned this already so I won't go into the details of LBYL and EAFP. But generally EAFP is faster and more robust (and often also shorter). No wonder that it's the recommended style in python :-)

DRY

The methods save_all_audio and convert_fileblock share (at first glance) some common code:

scipy.io.wavfile.write(...)
f, t, Sxx = signal.spectrogram(...)
fig = plt.pcolormesh(...)
fig.axes.get_xaxis().set_visible(False)
fig.axes.get_yaxis().set_visible(False)
plt.savefig(...)

Probably this could be factored into a seperate (private) method that is called by both methods.

More context managers

You timed a part of your script using t0 = time.time(), print('Time to process block{}: {}'.format(block_counter, time.time() - t0)). I would suggest creating a context manager for this task because that way you can see easily what is being timed (because it is indented) and you could reuse it whenever you want to time something else:

import time

class Timer():
    def __enter__(self):
        self._starttime = time.time()

    def __exit__(self, *args, **kwargs):
        time_elapsed = time.time() - self._starttime
        print(time_elapsed)  # or "logging.debug", see above

An example:

with Timer():
    time.sleep(2)

Be warned that this context manager class adds some overhead but time.time isn't really suitable for "precise" perfomance analysis anyway. If you want to accuratly time some code the timeit module should be used instead.

More comments!

Graipher · Accepted Answer · 2017-04-17 09:47:11Z

PyAudio doesn't seem to have a with interface, however you should add that interface to your class. PEP 343 explains why it was added to Python, but it mostly comes down to being syntactic sugar to a try finally. To add this is rather simple, you add __enter__ and __exit__ and that's it. The benefit to this, is you don't have to manually close the stream, as you don't in main.py. And so I'd change your code to include:

class AudioHandler(object):
    def __init__(self):
        ...
        self.pa = pyaudio.PyAudio()
        self.stream = None
        self.save_counter = '0'.zfill(8)
        ...

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        self.stream.close()

    def open(self):
        device_index = self.find_input_device()

        self.stream = self.pa.open(format=pyaudio.paInt16,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   input_device_index=device_index,
                                   frames_per_buffer=self.INPUT_FRAMES_PER_BLOCK)

I don't really know the libraries you're using and so I can't really comment on them, but I'd encourage a more EAFP rather than LBYL approach to file manipulation, so that you don't get TOCTOU bugs.

I'd also not use sys.exit, and just leave Python to exit normally. I know this is a relic of earlier code, but I'd definitely recommend you remove it, as it normally is a red light.

And so all the above would change main.py roughly to (I've not tested it):

from record import AudioHandler
import shutil

if __name__ == '__main__':    
    invalid = 0
    with AudioHandler() as audio:
        shutil.rmtree(audio.DATA_DIR, ignore_errors=True)
        while not invalid:
            try:
                invalid = audio.listen()
            except KeyboardInterrupt:
                break

        audio.convert_fileblock()
        audio.save_all_audio()

What is the __close__ method for? Did you mean __exit__? — MSeifert, Commented Apr 17, 2017 at 9:21
@MSeifert Edited it in, in the text he correctly said you need to define __exit__, so I guess it was just a typo (so many closes around there...). — Graipher, Commented Apr 17, 2017 at 9:48
Although you don't really need the open and close methods, you could move all the code there to the __enter__ and __exit__methods — Graipher, Commented Apr 17, 2017 at 12:32
@Graipher I personally would keep them in. If you want to extend the class, then it's simpler and cleaner to use open and close rather than the Python internal dependent __enter__ and __exit__. Also it wouldn't work without with, so like 2.5+ or something. — Peilonrayz, Commented Apr 17, 2017 at 12:34

Stack Exchange Network

Speech Recognition Part 1: Generate Training Data

2 Answers 2

Performance

Imports:

`try` and `except` in functions

Class attributes vs. instance attributes

`str.zfill` or `str.format`

`return` in loops

`enumerate`

`print` vs. `logging`

PEP8 and function calls

LBYL and EAFP

DRY

More context managers

More comments!

Not the answer you're looking for? Browse other questions tagged
python
performance
file
audio
data-visualization
or ask your own question.

Linked

Hot Network Questions

2 Answers 2

Performance

Imports:

try and except in functions

Class attributes vs. instance attributes

str.zfill or str.format

return in loops

enumerate

print vs. logging

PEP8 and function calls

LBYL and EAFP

More context managers

More comments!

Not the answer you're looking for? Browse other questions tagged pythonperformancefileaudiodata-visualization or ask your own question.

Linked

Related

`try` and `except` in functions

`str.zfill` or `str.format`

`return` in loops

`enumerate`

`print` vs. `logging`

Not the answer you're looking for? Browse other questions tagged
python
performance
file
audio
data-visualization
or ask your own question.