
Commit 7c44200

change folder name
1 parent 438ed2e commit 7c44200


68 files changed: +8905 −1 lines changed

AcousticEchoCancellation/LMS.py

+116
@@ -0,0 +1,116 @@
"""
@FileName: LMS.py
@Description: Implement LMS AEC
@Author: Ryuk
@CreateDate: 2020/07/01
@LastEditTime: 2020/07/04
@LastEditors: Please set LastEditors
@Version: v0.1
"""

import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


far, sr = librosa.load("./far.wav", sr=16000)
near, sr = librosa.load("./near.wav", sr=16000)

L = 128             # number of filter taps
N = len(far)        # number of iterations
T = 0.92            # double-talk detection threshold
lambda_DTD = 0.95   # DTD smoothing coefficient
DTDbegin = 20000    # sample index at which DTD starts

w = np.zeros(L)
xin = np.zeros(L)

# DTD-related state
varMIC = np.zeros(N)
r_em = np.zeros(N)

x = far
d = near

mu = 0.014
y = np.zeros(N)
e = np.zeros(N)
threshold = np.zeros(N)
decision_statistic = np.zeros(N)
powerD = np.zeros(N)
powerE = np.zeros(N)
ERLE = np.zeros(N)

for i in tqdm(range(N)):
    # shift the input buffer and insert the newest far-end sample
    for j in range(L - 1, 0, -1):
        xin[j] = xin[j - 1]

    # LMS
    xin[0] = x[i]

    y[i] = np.dot(w, xin)
    error = d[i] - y[i]
    e[i] = error
    wtemp = w + np.multiply(2 * mu * error, xin)

    # DTD: only adapt the filter when no double-talk is detected
    threshold[i] = T
    if i < DTDbegin:
        w = wtemp
    else:
        r_em[i] = lambda_DTD * r_em[i - 1] + (1 - lambda_DTD) * e[i] * d[i]
        varMIC[i] = np.sqrt(lambda_DTD * (varMIC[i - 1] ** 2) + (1 - lambda_DTD) * d[i] * d[i])
        decision_statistic[i] = 1 - (r_em[i] / varMIC[i]) ** 2

        if decision_statistic[i] > threshold[i]:
            w = wtemp

    # ERLE
    powerD[i] = np.abs(d[i]) ** 2  # power of microphone signal
    powerE[i] = np.abs(e[i]) ** 2  # power of error signal
    # note: entries after index i are still zero here, so each window mean
    # effectively reduces to the instantaneous ratio powerD[i] / powerE[i]
    ERLE[i] = 10 * np.log10(np.mean(powerD[i:i + L]) / np.mean(powerE[i:i + L]))

# plots
time = np.arange(0, len(far)) * (1.0 / sr)
fig = plt.figure(figsize=(10, 16))

# near
ax = fig.add_subplot(6, 1, 1)
ax.plot(time, near, 'b')
plt.ylabel("Near")
plt.tight_layout()

# far
ax = fig.add_subplot(6, 1, 2)
ax.plot(time, far, 'b')
plt.ylabel("Far")
plt.tight_layout()

# output
ax = fig.add_subplot(6, 1, 3)
ax.plot(time, y, 'b')
plt.ylabel("Output")
plt.tight_layout()

# error
ax = fig.add_subplot(6, 1, 4)
ax.plot(time, e, 'b')
plt.ylabel("Error")
plt.tight_layout()

# decision statistic
ax = fig.add_subplot(6, 1, 5)
ax.plot(time, decision_statistic, 'b')
plt.axhline(y=T, ls=":", c="red")
plt.ylabel("Decision_statistic")
plt.tight_layout()

# ERLE
ax = fig.add_subplot(6, 1, 6)
ax.plot(time, ERLE, 'b')
plt.ylabel("ERLE")
plt.tight_layout()

plt.show()
# librosa.output.write_wav was removed in librosa 0.8; soundfile is the usual replacement
sf.write("./output.wav", (near - y).astype(np.float32), sr)

AcousticEchoCancellation/README.md

+1
@@ -0,0 +1 @@
This is a simple acoustic echo cancellation demo with an LMS filter.
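For reference, the adaptation loop in LMS.py above is the standard LMS recursion. In the notation of the script:

$$y(n) = \mathbf{w}^T \mathbf{x}(n), \qquad e(n) = d(n) - y(n), \qquad \mathbf{w} \leftarrow \mathbf{w} + 2\mu\, e(n)\, \mathbf{x}(n)$$

where $\mathbf{x}(n)$ holds the last $L = 128$ far-end samples, $d(n)$ is the microphone signal, and $\mu = 0.014$. The filter is frozen whenever the double-talk decision statistic $1 - (r_{em}/\sigma_d)^2$ falls below the threshold $T = 0.92$, and performance is reported as $\mathrm{ERLE} = 10\log_{10}\!\left(E[d^2]/E[e^2]\right)$ in dB.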

AcousticEchoCancellation/far.wav

1.52 MB
Binary file not shown.

AcousticEchoCancellation/near.wav

1.52 MB
Binary file not shown.

AcousticEchoCancellation/output.wav

3.04 MB
Binary file not shown.

CommandRecognition/README.md

+3
@@ -0,0 +1,3 @@
Command recognition using a CNN.

usage: python train.py or sh train.sh
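train.py itself is not part of this commit view. Purely as an illustrative sketch of a command-recognition CNN (the layer sizes, class count, and 40-bin log-mel input are assumptions, not taken from the repo):

import torch
import torch.nn as nn

class CommandCNN(nn.Module):
    """Hypothetical CNN over log-mel spectrograms (n_mels x frames)."""
    def __init__(self, n_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, n_classes)
        )

    def forward(self, x):            # x: (batch, 1, n_mels, frames)
        return self.classifier(self.features(x))

model = CommandCNN(n_classes=10)
dummy = torch.randn(4, 1, 40, 101)   # batch of four one-second clips
print(model(dummy).shape)            # -> torch.Size([4, 10])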
+1
@@ -0,0 +1 @@
Environmental sound classification baseline using XGBoost as the classifier. Three features are used: MFCC, spectral contrast and ZCR.
+88
@@ -0,0 +1,88 @@
"""
@FileName: baseline.py
@Description: Implement baseline for ESC
@Author: Ryuk
@CreateDate: 2020/08/02
@LastEditTime: 2020/08/02
@LastEditors: Please set LastEditors
@Version: v0.1
"""


import os
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

RATE = 44100
FRAME = 512
NUM = 400

def compute_mfcc(wav):
    melspectrogram = librosa.feature.melspectrogram(wav, sr=RATE, hop_length=FRAME)
    logamplitude = librosa.core.amplitude_to_db(melspectrogram)
    mfcc = np.mean(librosa.feature.mfcc(S=logamplitude, n_mfcc=13), axis=1)
    return mfcc

def compute_spectral_contrast(wav):
    spectral_contrast = librosa.feature.spectral_contrast(wav, RATE, hop_length=512)
    spectral_contrast = np.mean(spectral_contrast, axis=1)
    return spectral_contrast

def compute_zcr(wav):
    zcr = []
    frames = librosa.util.frame(wav, hop_length=FRAME)
    frames = frames.T
    for i in range(len(frames)):
        zcr.append(np.mean(0.5 * np.abs(np.diff(np.sign(frames[i])))))

    zcr_mean = np.mean(zcr)
    zcr_std = np.std(zcr)
    return np.array([zcr_mean, zcr_std])


path = "D:\\Samples\\ESC-10\\"
folders = os.listdir(path)[:10]
print(folders)

feature = np.zeros([NUM, 22])
label = np.zeros(NUM)
count = 0
for i in tqdm(range(len(folders))):
    dirname = path + str(folders[i])
    wavefiles = os.listdir(dirname)

    for j in range(len(wavefiles)):
        wavname = dirname + "\\" + wavefiles[j]
        wav, RATE = librosa.load(wavname, RATE)
        mfcc = compute_mfcc(wav)
        sc = compute_spectral_contrast(wav)
        zcr = compute_zcr(wav)
        feature[count] = np.hstack([mfcc, sc, zcr])
        label[count] = i
        count += 1

# training
print("Start Training")
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.3, random_state=2020)
clf = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0.,
    subsample=0.8,
    colsample_bytree=0.8,  # fixed typo: was "colsample_btree"
    objective='multi:softmax',
    scale_pos_weight=1,
    random_state=2020
)
clf.fit(X_train, y_train)

print("Start Testing")
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy * 100.0))

GenderClassify/README.md

+3
@@ -0,0 +1,3 @@
Gender classification using a bi-LSTM with VAD.

usage: python train.py
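The training script is likewise not part of this view; a minimal sketch of what a bi-LSTM gender classifier over per-frame features could look like (the feature and hidden dimensions are assumptions):

import torch
import torch.nn as nn

class GenderBiLSTM(nn.Module):
    """Hypothetical bi-LSTM gender classifier over per-frame features."""
    def __init__(self, n_features=13, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden, 2)  # two classes: male / female

    def forward(self, x):              # x: (batch, frames, n_features)
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])  # classify from the last frame's state

logits = GenderBiLSTM()(torch.randn(4, 100, 13))
print(logits.shape)  # torch.Size([4, 2])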

GenderClassify/test_vad.txt

-1
This file was deleted.

Resample/README.md

+1
@@ -0,0 +1 @@
Resample speech with linear interpolation, Lagrange interpolation and sinc interpolation.
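The resampling code itself is not shown in this diff. A minimal sketch of the linear-interpolation variant (the function name and the example rates are assumptions):

import numpy as np

def resample_linear(x, src_rate, dst_rate):
    """Linearly interpolate x from src_rate to dst_rate (a minimal sketch)."""
    duration = len(x) / src_rate
    t_dst = np.arange(0, duration, 1.0 / dst_rate)   # new sample instants
    t_src = np.arange(len(x)) / src_rate             # original sample instants
    return np.interp(t_dst, t_src, x)

x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)  # 1 s of 440 Hz at 16 kHz
y = resample_linear(x, 16000, 8000)
print(len(y))  # ~8000 samples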

SoundSourceLocalization/README.md

+1
@@ -0,0 +1 @@
Sound source localization using a TDOA-based algorithm.
@@ -0,0 +1,64 @@
"""
@FileName: SoundSourceLocalization.py
@Description: Implement SoundSourceLocalization
@Author: Ryuk
@CreateDate: 2020/09/01
@LastEditTime: 2020/09/06
@LastEditors: Please set LastEditors
@Version: v0.1
"""

import numpy as np
import librosa
from tqdm import tqdm
import matplotlib.pyplot as plt


def gcc_phat(ref, sig, sr):
    n_point = 2 * ref.shape[0] - 1
    X = np.fft.fft(ref, n_point)
    Y = np.fft.fft(sig, n_point)
    XY = X * np.conj(Y)

    # PHAT weighting: keep only the phase of the cross-spectrum
    c = XY / (abs(X) * abs(Y) + 10e-6)
    c = np.real(np.fft.ifft(c))
    end = len(c)
    center_point = end // 2

    # fft shift
    c = np.hstack((c[center_point + 1:], c[:center_point + 1]))
    lag = np.argmax(abs(c)) - len(ref) + 1
    tau = lag / sr
    return tau


SOUND_SPEED = 340.0
MIC_DISTANCE = 0.15
sample_rate = 48000
MAX_TDOA = MIC_DISTANCE / float(SOUND_SPEED)

org_ref, sr = librosa.load("./data/ref.wav", sr=sample_rate)
org_sig, sr = librosa.load("./data/sig.wav", sr=sample_rate)


ref = librosa.util.frame(org_ref, 1024, 256).T
sig = librosa.util.frame(org_sig, 1024, 256).T
fai = []
for i in tqdm(range(len(ref))):
    tau = gcc_phat(ref[i], sig[i], sample_rate)
    theta = np.arcsin(tau / MAX_TDOA) * 180 / np.pi
    fai.append(theta)


plt.subplot(211)
plt.ylabel('DOA')
plt.xlabel('Frame index')
plt.title('DOA')
plt.plot(fai)
plt.subplot(212)
plt.ylabel('Amplitude')
plt.xlabel('Frame index')
plt.title('Waveform')
plt.plot(org_ref)
plt.tight_layout()
plt.show()
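With microphone spacing $d = 0.15$ m and sound speed $c = 340$ m/s, the largest physical delay between the two channels is $\tau_{\max} = d/c$, so each frame's angle estimate above follows

$$\theta = \arcsin\!\left(\frac{\tau}{\tau_{\max}}\right) = \arcsin\!\left(\frac{\tau c}{d}\right),$$

with $\tau$ taken from the peak of the PHAT-weighted cross-correlation.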

SoundSourceLocalization/data/ref.wav

1.4 MB
Binary file not shown.

SoundSourceLocalization/data/sig.wav

1.4 MB
Binary file not shown.

SpectralSubtraction/README.md

+1
@@ -0,0 +1 @@
Spectral subtraction algorithm for noise reduction.
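The implementation file is not included in this commit view. A minimal sketch of magnitude spectral subtraction, assuming non-overlapping frames, a noise estimate taken from the first few frames, and zero flooring (all assumptions, not the repo's method):

import numpy as np

def spectral_subtraction(noisy, frame_len=256, noise_frames=10):
    """Minimal magnitude spectral subtraction sketch.

    The first `noise_frames` frames are assumed to be noise-only, and
    negative magnitudes are floored at zero (half-wave rectification).
    """
    n = len(noisy) // frame_len * frame_len
    frames = noisy[:n].reshape(-1, frame_len)
    spectra = np.fft.rfft(frames, axis=1)
    mag, phase = np.abs(spectra), np.angle(spectra)

    noise_mag = mag[:noise_frames].mean(axis=0)    # noise magnitude estimate
    clean_mag = np.maximum(mag - noise_mag, 0.0)   # subtract and floor

    clean = np.fft.irfft(clean_mag * np.exp(1j * phase), n=frame_len, axis=1)
    return clean.reshape(-1)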

SpeechAugmentation/README.md

+1
@@ -0,0 +1 @@
Add echo, noise, howling and reverberation to clean speech.

SpeechAugmentation/addNoise.py

+92
@@ -0,0 +1,92 @@
"""
@FileName: addNoise.py
@Description: Implement addNoise
@Author: Ryuk
@CreateDate: 2020/05/11
@LastEditTime: 2020/05/11
@LastEditors: Please set LastEditors
@Version: v0.1
"""
import numpy as np
from scipy import signal
from numpy.linalg import norm


def add_noise(clean, noise, snr):
    if len(noise) > len(clean):
        noise = noise[:len(clean)]
    else:
        times = len(clean) // len(noise)
        noise = np.tile(noise, times)
        padding = [0] * (len(clean) - len(noise))
        noise = np.hstack([noise, padding])

    # scale the noise so the mix has the requested SNR in dB
    noise = noise / norm(noise) * norm(clean) / (10.0 ** (0.05 * snr))
    mix = clean + noise
    return mix


def addEcho(clean, sr, alpha, beta=0.5, delay=0.1, type=1):
    """
    add an echo signal to raw speech
    :param clean: clean speech
    :param sr: sample rate
    :param alpha: parameter for type 1
    :param beta: parameter for type 2
    :param delay: parameter for type 2
    :param type: echo type
    :return: mixed signal
    """
    if type == 1:
        # impulse response with two decaying reflections
        h = [1]
        h.extend([0] * int(alpha * sr))
        h.extend([0.5])
        h.extend([0] * int(alpha * sr))
        h.extend([0.25])
        mix = signal.convolve(clean, h)
    else:
        mix = clean.copy()
        shift = int(delay * sr)
        for i in range(shift, len(clean)):
            mix[i] = beta * clean[i] + (1 - beta) * clean[i - shift]
    return mix


def add_reverberation(clean, alpha=0.8, R=2000):
    b = [0] * (R + 1)
    b[0], b[-1] = alpha, 1
    a = [0] * (R + 1)
    a[0], a[-1] = 1, 0.5
    mix = signal.filtfilt(b, a, clean)
    return mix


def add_howl(clean, K=0.2):
    g = np.loadtxt("./path.txt")       # acoustic feedback path
    c = np.array([0, 0, 0, 0, 1]).T    # forward-path delay
    h = np.zeros(201)
    h[100] = 1

    xs1 = np.zeros(c.shape[0])
    xs2 = np.zeros(g.shape)
    xs3 = np.zeros(h.shape)

    mix = np.zeros(len(clean))
    temp = 0

    for i in range(len(clean)):
        xs1[1:] = xs1[:-1]
        xs1[0] = clean[i] + temp       # microphone picks up speech plus feedback
        mix[i] = K * np.dot(xs1.T, c)

        xs3[1:] = xs3[:-1]
        xs3[0] = mix[i]
        mix[i] = np.dot(xs3.T, h)

        # clip to [-1, 1]
        mix[i] = min(1, mix[i])
        mix[i] = max(-1, mix[i])

        xs2[1:] = xs2[:-1]
        xs2[0] = mix[i]
        temp = np.dot(xs2.T, g)        # signal fed back through the room
    return mix
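The clean.wav, noisy.wav, echoic.wav and howling.wav binaries below suggest a driver along these lines, using the functions above (a sketch only; the actual script is not in this view, and noise.wav, the sample rate, and the SNR value are assumptions):

import librosa
import soundfile as sf

clean, sr = librosa.load("./clean.wav", sr=16000)
noise, _ = librosa.load("./noise.wav", sr=16000)   # hypothetical noise file

sf.write("./noisy.wav", add_noise(clean, noise, snr=10), sr)
sf.write("./echoic.wav", addEcho(clean, sr, alpha=0.3), sr)
sf.write("./howling.wav", add_howl(clean), sr)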

SpeechAugmentation/clean.wav

156 KB
Binary file not shown.

SpeechAugmentation/echoic.wav

312 KB
Binary file not shown.

SpeechAugmentation/howling.wav

312 KB
Binary file not shown.

SpeechAugmentation/noisy.wav

312 KB
Binary file not shown.
