TensorFlow Speech Recognition Challenge

import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
# pip install librosa==0.62
# pip install numba==0.48 # 만약 설치가 안된다면, 관리자 권한으로 설치!
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

%matplotlib inline

1. Visualization of recordings - input features

인간의 청각에는 두 가지 이론(장소 이론과 시간 이론)이 있다. 음성 인식에서는 두 가지 입력 방식이 주로 쓰인다: 스펙트럼 분석(주파수) 결과를 입력하는 방식과, 보다 정교한 feature인 MFCC(Mel Frequency Cepstral Coefficients)를 입력하는 방식이다.

1.1. Wave and spectrogram

# Load one sample recording from the training set.
train_audio_path = '../data/tensorflow-speech-recognition-challenge/train/audio/'
filename = '/yes/0a7c2a8d_nohash_0.wav'
sample_rate, samples = wavfile.read(f'{train_audio_path}{filename}')

'''
spectrogram을 계산하는 함수 정의한다. 
참고로, 우리는 spectrogram 값의 로그를 취하고 있다. 
그것은 우리의 plot을 훨씬 더 명확하게 할 것이고, 더욱이 그것은 사람들이 듣는 방식과 엄격히 연결되어 있다. 
로그에 대한 입력으로 0 값이 없다는 보장이 필요하다.
'''

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Compute the log-scaled spectrogram of an audio signal.

    Args:
        audio: Array of audio samples (a 1-D signal in this file's usage).
        sample_rate: Sampling rate of ``audio`` in Hz.
        window_size: Length of each analysis window, in milliseconds.
        step_size: Hop between the starts of consecutive windows, in
            milliseconds. Must be positive and no longer than the window.
        eps: Small constant added to the spectrogram before the logarithm
            so that the log never receives an exact zero.

    Returns:
        A tuple ``(freqs, times, log_spec)``. ``freqs`` and ``times`` are
        the frequency bins and segment times reported by
        ``scipy.signal.spectrogram``; ``log_spec`` is the float32 log
        spectrogram, transposed so that rows are time frames.

    Raises:
        ValueError: If the step is not in the range ``(0, window]`` once
            converted to samples.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be positive and no larger than window_size')
    # scipy wants the number of overlapping samples, not the hop, so the
    # hop has to be converted. With the default 20 ms / 10 ms settings
    # both values coincide.
    noverlap = nperseg - step
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

Nyquist theorem에 따르면 표현 가능한 최대 주파수는 샘플링 주파수의 절반이므로, 16kHz로 샘플링된 이 녹음의 주파수(Frequencies) 범위는 0~8000Hz이다.

# Log-spectrogram of the recording loaded above.
freqs, times, spectrogram = log_specgram(samples, sample_rate)

fig = plt.figure(figsize=(14, 8))

# Top panel: raw waveform against time in seconds.
ax1 = fig.add_subplot(211)
# set_title must be called; assigning to it replaces the method and
# leaves the panel without a title.
ax1.set_title('Raw wave of ' + filename)
ax1.set_ylabel('Amplitude')
# The clip lasts len(samples) / sample_rate seconds and needs exactly one
# x value per sample, whatever the clip length is.
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)

# Bottom panel: spectrogram with time on x and frequency on y.
ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
ax2.set_title('Spectrogram of ' + filename)
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('seconds')

Text(0.5, 0, 'seconds')

spectrogram을 NN의 입력 변수로 사용한다면, 변수를 정규화해야 한다는 것을 기억해야 한다.

# Standardise each column of the spectrogram (one column per frequency
# bin) using its mean and standard deviation over the time frames.
mean = spectrogram.mean(axis=0)
std = spectrogram.std(axis=0)
spectrogram = (spectrogram - mean) / std

1.2. MFCC

MFCC에 대한 자세한 내용을 알고 싶다면 이 튜토리얼을 보면 된다. 튜토리얼을 보면 MFCC가 인간의 청각 특성을 잘 모방하도록 만들어졌다는 것을 알 수 있다. librosa python package를 사용하여 Mel power spectrogram과 MFCC를 계산할 수 있다.

# librosa.load returns the samples together with the rate it loaded them
# at; that rate (not the one reported by wavfile) describes this data.
librosa_samples, librosa_sample_rate = librosa.load(str(train_audio_path) + filename)
# Pass the signal by keyword: it is accepted by every librosa version,
# whereas newer releases reject it as a positional argument.
S = librosa.feature.melspectrogram(y=librosa_samples, sr=librosa_sample_rate, n_mels=128, fmax=8000)

# Convert power to log scale (dB), using the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(12, 4))
# The axes must be built from the same sr and fmax the mel spectrogram was
# computed with, otherwise the time and mel ticks are mislabelled.
librosa.display.specshow(log_S, sr=librosa_sample_rate, x_axis='time', y_axis='mel', fmax=8000)
plt.title('Mel Power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

c:\users\hanbit\appdata\local\programs\python\python37\lib\site-packages\librosa\display.py:797: MatplotlibDeprecationWarning:

The 'basey' parameter of __init__() has been renamed 'base' since Matplotlib 3.3; support for the old name will be dropped two minor releases later.

c:\users\hanbit\appdata\local\programs\python\python37\lib\site-packages\librosa\display.py:797: MatplotlibDeprecationWarning:

The 'linthreshy' parameter of __init__() has been renamed 'linthresh' since Matplotlib 3.3; support for the old name will be dropped two minor releases later.

# 13 MFCCs computed from the log-mel spectrogram.
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)

# Second-order delta of the MFCCs.
delta2_mfcc = librosa.feature.delta(mfcc, order=2)

# NOTE(review): the figure is titled 'MFCC' but displays delta2_mfcc.
plt.figure(figsize=(12, 4))
librosa.display.specshow(delta2_mfcc)
plt.title('MFCC')
plt.xlabel('Time')
plt.ylabel('MFCC coeffs')
plt.colorbar()
plt.tight_layout()

1.3. Spectrogram in 3d

# Interactive 3-D surface of the spectrogram: time on x, frequency on y,
# spectrogram value on z.
data = [go.Surface(x=times, y=freqs, z=spectrogram.T)]
layout = go.Layout(
    title='Specgtrogram of "yes" in 3d',
    scene={
        'xaxis': {'title': 'Time', 'range': [times.min(), times.max()]},
        'yaxis': {'title': 'Frequncy', 'range': [freqs.min(), freqs.max()]},
        'zaxis': {'title': 'Log amplitude'},
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

1.4. Silence removal

silence 구간 지우기

# Audio player for the full recording at its original sampling rate.
ipd.Audio(samples, rate=sample_rate)

나는 VAD(Voice Activity Detection)가 여기서 정말 유용할 것이라고 생각한다. 비록 단어는 짧지만 녹음 안에는 침묵 구간이 많다. 괜찮은 VAD는 훈련 데이터 크기를 많이 줄여 훈련 속도를 크게 높일 수 있다. 파일의 앞부분과 뒷부분을 잘라내고 다시 들어보자(위의 플롯을 바탕으로 4000에서 13000까지의 샘플을 사용한다).

# Keep samples 4000 up to (not including) 13000, a range picked by eye
# from the waveform plot, then play the trimmed clip.
samples_cut = samples[4000:13000]
ipd.Audio(samples_cut, rate=sample_rate)

우리는 전체 단어를 들을 수 있다는 것에 동의할 수 있다. 모든 파일을 수동으로 자르거나 간단한 플롯을 기준으로 하는 것은 불가능하다. 하지만 좋은 VAD를 갖기 위해 webrtcvad 패키지를 예로 들 수 있다.

추측된 'y' 'e' 's' 그래프의 정렬과 함께 다시 한 번 그려보자.

# Log-spectrogram of the trimmed clip (this rebinds freqs and times).
freqs, times, spectrogram_cut = log_specgram(samples_cut, sample_rate)

fig = plt.figure(figsize=(14, 8))

# Top panel: trimmed waveform, x axis in sample indices.
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + filename)
ax1.set_ylabel('Amplitude')
ax1.plot(samples_cut)

# Bottom panel: the extent puts seconds on x and Hz on y, so the axis
# labels state those units.
ax2 = fig.add_subplot(212)
ax2.set_title('Spectrogram of ' + filename)
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('seconds')
ax2.imshow(spectrogram_cut.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])

ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])

# Guessed positions of the three sounds, in seconds on the spectrogram.
ax2.text(0.06, 1000, 'Y', fontsize=18)
ax2.text(0.17, 1000, 'E', fontsize=18)
ax2.text(0.36, 1000, 'S', fontsize=18)

# Guessed boundaries between the sounds, in seconds (same unit as the text
# positions above). The waveform panel is indexed by sample, so seconds
# are converted with the sampling rate; the spectrogram panel takes them
# as they are.
xcoords = [0.025, 0.11, 0.23, 0.49]
for xc in xcoords:
    ax1.axvline(x=xc * sample_rate, c='r')
    ax2.axvline(x=xc, c='r')

1.5. Resampling - dimensionality reductions

데이터의 차원을 줄이는 또 다른 방법은 녹음을 다시 샘플링(resampling)하는 것이다.

녹음은 16k 주파수로 샘플링되기 때문에 자연스럽지 않다는 것을 들을 수 있고, 우리는 보통 훨씬 더 많이 듣는다. 그러나 가장 많은 음성 관련 주파수는 더 작은 대역으로 제시된다. 그렇기 때문에 당신은 GSM 신호가 8000Hz로 샘플링되는 전화와 통화하는 다른 사람을 여전히 이해할 수 있다.

요약하면 데이터 집합을 8k로 다시 샘플링할 수 있다. 우리는 중요하지 않아야 할 정보를 버리고 데이터의 크기를 줄일 것이다.

우리는 이것이 위험할 수 있다는 것을 기억해야 한다. 왜냐하면 이것은 경쟁이기 때문이다. 그리고 때로는 아주 작은 성과 차이가 이기기 때문에 우리는 아무것도 잃고 싶지 않다. 반면에, 첫 번째 실험은 작은 훈련 크기로 훨씬 더 빨리 할 수 있다.

FFT(Fast Fourier Transform) 계산이 필요하다.

# Compute the FFT
def custom_fft(y, fs):
    """Return the first half of the amplitude spectrum of a signal.

    Args:
        y: Array of samples; its first dimension is the signal length N.
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        A tuple ``(xf, vals)``: ``xf`` holds the frequencies in Hz of FFT
        bins ``0 .. N//2 - 1`` and ``vals`` the magnitudes of those bins
        scaled by ``2 / N``.
    """
    N = y.shape[0]
    yf = fft(y)
    # Bin k of an N-point FFT sits at k * fs / N. A linspace that ends at
    # fs / 2 would stretch every frequency slightly, because the last bin
    # kept is one step below fs / 2.
    xf = np.arange(N // 2) * fs / N
    # For a real-valued signal the spectrum is symmetric, so only the first
    # half is kept. The FFT is complex, so its magnitude (abs) is taken.
    vals = 2.0 / N * np.abs(yf[0:N // 2])
    return xf, vals

'''
녹음된 내용을 읽고 다시 샘플링한 후 들어보자.
또한 FFT, Notice를 비교할 수 있는데, 원래 신호에는 4000Hz 이상의 정보가 거의 없다.
'''

'\n녹음된 내용을 읽고 다시 샘플링한 후 들어보자.\n또한 FFT, Notice를 비교할 수 있는데, 원래 신호에는 4000Hz 이상의 정보가 거의 없다.\n'

# Load another recording and resample it to new_sample_rate.
filename = '/happy/0b09edd3_nohash_0.wav'
new_sample_rate = 8000
sample_rate, samples = wavfile.read(f'{train_audio_path}{filename}')
resampled = signal.resample(samples, int(new_sample_rate / sample_rate * len(samples)))

# Player for the original recording.
ipd.Audio(samples, rate=sample_rate)

# Player for the resampled recording.
ipd.Audio(resampled, rate=new_sample_rate)

두개를 들어보면 거의 차이가 없다.

# Spectrum of the recording at its original sampling rate.
xf, vals = custom_fft(samples, sample_rate)
plt.figure(figsize=(12, 4))
plt.plot(xf, vals)
plt.title(f'FFT of recording sampled with {sample_rate} Hz')
plt.xlabel('Frequency')
plt.grid()
plt.show()

# Spectrum of the resampled recording, for comparison.
xf, vals = custom_fft(resampled, new_sample_rate)
plt.figure(figsize=(12, 4))
plt.plot(xf, vals)
plt.title(f'*Resample data* FFT of recording sampled with {new_sample_rate} Hz')
plt.xlabel('Frequency')
plt.grid()
plt.show()

이것이 데이터셋 크기를 절반으로 줄인 방법이다!

1.6. Features extraction steps

변수 추출 알고리즘은 아래와 같이 진행한다.

Resampling
VAD (Voice Activity Detection)
신호 길이가 같도록 0으로 padding
Log spectrogram( or MFCC, or PLP)
피쳐의 평균 및 표준편차 정규화
시간적(temporal) 정보를 얻기 위해 지정된 수의 프레임 쌓기

분명히 중요한게 아니지만, 이방법을 사용하면 약간의 왜곡을 발견할 수 있다.

[kaggle][필사] TensorFlow Speech Recognition Challenge (2) (0)	2020.10.09
[kaggle][필사] New York City Taxi Duration (3) (0)	2020.10.05
[kaggle][필사] New York City Taxi Duration (2) (0)	2020.10.04

춤추는 개발자

[kaggle][필사] TensorFlow Speech Recognition Challenge (1)

1. Visualization of recordings - input features

1.1. Wave and spectrogram

1.2. MFCC

1.3. Sprectrogram in 3d

1.4. Silence removal

1.5. Resampling - dimensionality reductions

1.6. Features extraction steps

'Competition > Kaggle' 카테고리의 다른 글

'Competition/Kaggle'의 다른글

티스토리툴바

[kaggle][필사] TensorFlow Speech Recognition Challenge (1)

1. Visualization of recordings - input features

1.1. Wave and spectrogram

1.2. MFCC

1.3. Sprectrogram in 3d

1.4. Silence removal

1.5. Resampling - dimensionality reductions

1.6. Features extraction steps

'Competition > Kaggle' 카테고리의 다른 글

'Competition/Kaggle'의 다른글

관련글

티스토리툴바