Commit a30fa88b authored by Paul Bethge's avatar Paul Bethge
Browse files

Merge branch 'simplify' of...

Merge branch 'simplify' of git.zkm.de:Hertz-Lab/Research/intelligent-museum/language-identification into simplify
parents d5d71668 12f74332
......@@ -17,6 +17,8 @@ parallelize: True
remove_raw: True
language_table:
- lang: "chinese"
dir: "zh-CN"
- lang: "english"
dir: "en"
- lang: "farsi"
......@@ -25,10 +27,10 @@ language_table:
dir: "fr"
- lang: "german"
dir: "de"
- lang: "italian"
dir: "it"
- lang: "polish"
dir: "pl"
- lang: "mandarin"
dir: "zh-CN"
- lang: "russian"
dir: "ru"
- lang: "spanish"
......
......@@ -114,8 +114,7 @@ def traverse_csv(language, input_dir, output_dir, max_chops,
rand_int = np.random.randint(low=0, high=2)
padding_choice = ["Data", "Silence"][rand_int]
if vad:
if use_vad:
chips = vad.chop_from_file(wav_path_raw, padding=padding_choice)
else:
chips = chop_up_audio (wav_path_raw, padding=padding_choice,
......@@ -123,8 +122,10 @@ def traverse_csv(language, input_dir, output_dir, max_chops,
min_length_s=min_length_s, max_silence_s=max_silence_s,
threshold=energy_threshold)
for chip_name, chip_fs, chip_data in chips:
if chip_data.dtype == "float32":
chip_data = chip_data * 32768
chip_data = chip_data.astype("int16")
wav_path = os.path.join(output_dir_wav, chip_name + ".wav")
wav.write(wav_path, chip_fs, chip_data)
produced_files += 1
......@@ -155,9 +156,9 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--config_path', default=None,
help="path to the config yaml file. When given, arguments will be ignored")
parser.add_argument("--cv_input_dir", type=str, default=None,
parser.add_argument("--cv_input_dir", type=str, default="common_voice",
help="directory containing all languages")
parser.add_argument("--cv_output_dir", type=str, default="../res",
parser.add_argument("--cv_output_dir", type=str, default="common_voice_processed",
help="directory to receive converted clips of all languages")
# Data
parser.add_argument("--max_chops", type=int, nargs=3, default=[-1, -1, -1],
......@@ -189,6 +190,7 @@ if __name__ == '__main__':
# overwrite arguments when config is given
if args.config_path:
config = load(open(args.config_path, "rb"))
if config is None:
print("Could not find config file")
exit(-1)
......@@ -205,6 +207,7 @@ if __name__ == '__main__':
args.sample_width = config["sample_width"]
args.parallelize = config["parallelize"]
args.remove_raw = config["remove_raw"]
args.use_vad = config["use_vad"]
language_table = config["language_table"]
# copy config to output dir
......
......@@ -12,7 +12,7 @@ import os
import time
import auditok
from .utils import pad_with_data, pad_with_noise, pad_with_silence, to_array
from src.audio.utils import pad, to_array
def chop_up_audio (file_name, desired_length_s = 5,
......@@ -35,16 +35,8 @@ def chop_up_audio (file_name, desired_length_s = 5,
# extend tokens to desired length
audio_cuttings = []
for i, r in enumerate(regions):
numpy_data = to_array(r._data, 2, 1)
if padding == "Silence":
extended_token = pad_with_silence(numpy_data, nn_input_len)
elif padding == "Data":
extended_token = pad_with_data(numpy_data, nn_input_len)
else:
extended_token = pad_with_noise(numpy_data, nn_input_len)
extended_token = pad(numpy_data, nn_input_len, padding)
file_name_out = os.path.split(file_name)[-1][:-4] + "_" + str(i)
data_tuple = (file_name_out, sample_rate, extended_token)
audio_cuttings.append(data_tuple)
......
......@@ -719,7 +719,7 @@ def drop_chunks(tss: List[dict],
import os
from src.audio.utils import pad_with_data, pad_with_noise, pad_with_silence, to_array
from src.audio.utils import pad
from src.audio.utils import LogicDataSource, LogicValidater
from auditok.core import StreamTokenizer
......@@ -759,20 +759,11 @@ class VADTokenizer():
# reconstruct audio regions from index regions
regions = [wav[index[1]*self.vad_resolution: index[2]*self.vad_resolution] for index in indices]
# extend tokens to desired length
audio_cuttings = []
for i, r in enumerate(regions):
numpy_data = r.cpu().detach().numpy()
if padding == "Silence":
extended_token = pad_with_silence(numpy_data, self.nn_input_len)
elif padding == "Data":
extended_token = pad_with_data(numpy_data, self.nn_input_len)
else:
extended_token = pad_with_noise(numpy_data, self.nn_input_len)
extended_token = pad(numpy_data, self.nn_input_len, padding)
file_name_out = os.path.split(file_path)[-1][:-4] + "_" + str(i)
data_tuple = (file_name_out, self.sample_rate, extended_token)
audio_cuttings.append(data_tuple)
......
......@@ -30,7 +30,7 @@ def pad_with_silence(data, max_len):
def pad_with_data(data, max_len):
to_add = max(max_len - len(data), 0)
padded = np.zeros(shape=(max_len,), dtype="int16")
padded = np.zeros(shape=(max_len,), dtype="float32")
if to_add:
repeat = int(max_len / len(data))
rest = max_len % len(data)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment