Now that I have generated training data, I need to label each example so that I can build a suitable dataset and train a TensorFlow neural net on it. To streamline the labelling, I wrote this small Python script. Any suggestions for improvement?
classify.py:
# Builtin modules
import glob
import sys
import os
import shutil
import wave
import time
import re
from threading import Thread
# 3rd party modules
import scipy.io.wavfile
import pyaudio
# Directory that is globbed for '*.wav' samples and from which the matching
# 'spec<N>.jpeg' files are copied.
DATA_DIR = 'raw_data'
# Root of the output tree; each label gets its own sub-directory below it.
LABELED_DIR = 'labeled_data'
# Hand-off slot between the prompt loop (which writes the typed label) and
# classify_files() (which reads it and resets it). None means "no label yet".
answer = None
def classify_files():
    """Loop each WAV sample until a label is typed, then file its spectrogram.

    For every ``*.wav`` in ``DATA_DIR`` the audio is played on repeat (with a
    one-second pause between repetitions) until the module-level ``answer``
    becomes non-``None``.  A label of ``'.'`` skips the sample; any other
    label copies ``DATA_DIR/spec<N>.jpeg`` into ``LABELED_DIR/<label>/``,
    where ``<N>`` is the first run of digits in the WAV file's base name.

    The wave reader, the output stream and the PyAudio instance are released
    even when a sample is skipped or an exception is raised.

    Raises:
        IndexError: if a WAV file's base name contains no digits.
        OSError: propagated from ``shutil.copyfile`` / ``os.makedirs``, e.g.
            when the expected spectrogram file does not exist.
    """
    global answer

    # Number of frames handed to the audio device per write.
    chunk = 1024

    p = pyaudio.PyAudio()
    try:
        for filename in glob.glob('{}/*.wav'.format(DATA_DIR)):
            # The context manager closes the wave reader for every sample.
            with wave.open(filename, 'rb') as wf:
                stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                                channels=wf.getnchannels(),
                                rate=wf.getframerate(),
                                output=True)
                try:
                    data = wf.readframes(chunk)
                    # Keep playing until the prompt loop stores a label.
                    while answer is None:
                        stream.write(data)
                        data = wf.readframes(chunk)
                        if data == b'':  # end of file: pause, then replay
                            wf.rewind()
                            time.sleep(1)
                            data = wf.readframes(chunk)
                finally:
                    # Runs on the skip path too, so no stream is leaked.
                    stream.stop_stream()
                    stream.close()

            # Take the label and free the slot immediately; resetting only
            # after the copy would overwrite a label typed in the meantime.
            # NOTE: this read-then-reset is still not atomic.
            label = answer
            answer = None

            # '.' means "don't know how to classify": skip this sample.
            if label == '.':
                continue

            # Sort the spectrogram based on the label.  Only the base name is
            # searched so digits in the directory part cannot be picked up.
            sample_number = re.findall(r'\d+', os.path.basename(filename))[0]
            spec_filename = 'spec{}.jpeg'.format(sample_number)
            os.makedirs('{}/{}'.format(LABELED_DIR, label), exist_ok=True)
            shutil.copyfile('{}/{}'.format(DATA_DIR, spec_filename),
                            '{}/{}/{}'.format(LABELED_DIR, label, spec_filename))
    finally:
        # Release PortAudio resources however the loop ends.
        p.terminate()
if __name__ == '__main__':
    # Remove ALL.wav so the '*.wav' glob does not offer it for labelling.
    # It is already gone on every run after the first, which is fine.
    try:
        os.remove('{}/ALL.wav'.format(DATA_DIR))
    except FileNotFoundError:
        pass

    num_files = len(glob.glob('{}/*.wav'.format(DATA_DIR)))

    # Daemon thread: Ctrl+C in the prompt must end the whole process instead
    # of leaving the playback loop running in the background.
    worker = Thread(target=classify_files, daemon=True)
    worker.start()
    try:
        # One prompt per sample; classify_files() picks each label up from
        # the module-level `answer`.
        for _ in range(num_files):
            answer = input("Enter letter of sound heard: ")
        # Wait for the last sample to be filed; a daemon thread would
        # otherwise be killed as soon as the main thread returns.
        worker.join()
    except KeyboardInterrupt:
        sys.exit()