Commit 392e450a authored by Paul Bethge's avatar Paul Bethge
Browse files

add noise dl, process and split script

parent ad473caa
AUDIOSET_CSV="balanced.csv"
YTDL_DIR=$PWD/"yt-noise"
YTDL_DIR=$PWD/"__noise"
NOISE_DIR=$PWD/"__noise"
CV_DIR="$PWD/data/cv"
python3 data/audioset/download_youtube_noise.py --input_file $AUDIOSET_CSV --output_dir $YTDL_DIR
python3 data/other/cut_audio.py --input_dir $YTDL_DIR --output_dir
\ No newline at end of file
# Post-process the downloaded audio: run cut_audio.py on $YTDL_DIR (output goes
# to $NOISE_DIR), hand $NOISE_DIR to the split script (output goes to $CV_DIR),
# then delete the intermediate $NOISE_DIR.
#
# The steps are chained with && so the intermediate directory is only removed
# when both earlier steps succeeded; a failed step leaves the data in place for
# inspection. All expansions are quoted so paths containing spaces stay intact.
#
# NOTE(review): the split script path has no ".py" extension, unlike the other
# script invocations in this file -- confirm the actual file name.
python3 data/other/cut_audio.py --input_dir "$YTDL_DIR" --output_dir "$NOISE_DIR" \
  && python3 data/other/split_to_common_voice --input_dir "$NOISE_DIR" --output_dir "$CV_DIR" \
  && rm -r -- "$NOISE_DIR"
\ No newline at end of file
......@@ -12,7 +12,7 @@ from shutil import copyfile, move
import argparse
def create_small_dataset(path, new_path, dataset_size):
def create_small_dataset(path, new_path, dataset_size, move_it=True):
parent_list = os.listdir(path)
for child in parent_list:
......@@ -34,20 +34,25 @@ def create_small_dataset(path, new_path, dataset_size):
if dataset_size != -1 or count < dataset_size:
file_path = os.path.join(subdir_path, file)
new_file_path = os.path.join(new_subdir_path, file)
copyfile(file_path, new_file_path)
# print(file_path, new_file_path)
if move_it:
move(file_path, new_file_path)
else:
copyfile(file_path, new_file_path)
else:
break
count = count + 1
print("copyied or moved" + str(count) + "files")
if move_it:
print("moved " + str(count) + " files")
else:
print("copyied " + str(count) + " files")
def str_to_bool(value):
    """Convert a command line token such as ``true`` or ``0`` into a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    every non-empty value (including ``"False"``) becomes ``True``. This
    converter interprets the text instead.

    :param value: the raw argument (a ``str``, or a ``bool`` default)
    :returns: the parsed boolean
    :raises argparse.ArgumentTypeError: if the text is not a known boolean word
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


if __name__ == "__main__":
    # Each option is registered exactly once; registering the same option
    # string twice makes argparse raise a conflict error at start-up.
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', required=True, help="input directory")
    parser.add_argument('--target', required=True, help="output directory")
    parser.add_argument('--size', type=int, default=10, help="put -1 for all")
    parser.add_argument('--move', type=str_to_bool, default=True,
                        help="whether to move or just copy (true/false)")
    cli_args = parser.parse_args()

    create_small_dataset(cli_args.source, cli_args.target, cli_args.size, cli_args.move)
"""
:author:
Paul Bethge (bethge@zkm.de)
2021
:License:
This package is published under Simplified BSD License.
"""
import os
import glob
from shutil import copyfile, move
import argparse
def split_dataset_to_cv_dir(src, dest, split, target_name, move_it=True):
    """Distribute the ``*.wav`` files of ``src`` over train/test/dev folders.

    Every file ends up in ``<dest>/<train|test|dev>/<target_name>/`` under its
    original file name. Files are processed in sorted order so that the same
    input always produces the same split (``glob`` itself returns files in
    arbitrary order).

    :param src: directory that is searched (non-recursively) for ``*.wav`` files
    :param dest: root directory below which the split folders are created
    :param split: sequence whose first two entries are the train and test
        fractions; whatever remains goes to dev
    :param target_name: name of the sub-directory created inside each split
    :param move_it: move the files if true, otherwise copy them
    """
    file_paths = sorted(glob.glob(os.path.join(src, '*.wav')))
    num_files = len(file_paths)

    # Exclusive index boundaries of the train and test portions. Comparing
    # indices with "<" avoids the off-by-one of "i / num_files <= fraction",
    # which puts 9 of 10 files into train for a 0.8 fraction and none in dev.
    train_end = round(num_files * split[0])
    test_end = round(num_files * (split[0] + split[1]))

    transfer = move if move_it else copyfile

    for i, file_path in enumerate(file_paths):
        if i < train_end:
            dest_split = "train"
        elif i < test_end:
            dest_split = "test"
        else:
            dest_split = "dev"

        # create output dir
        dest_dir = os.path.join(dest, dest_split, target_name)
        os.makedirs(dest_dir, exist_ok=True)

        # basename() instead of split('/') so Windows separators work too
        new_file_path = os.path.join(dest_dir, os.path.basename(file_path))
        transfer(file_path, new_file_path)

    # Report the number of files handled (not the last loop index) and branch
    # on the move_it flag rather than on the always-truthy shutil.move function.
    if move_it:
        print("moved " + str(num_files) + " files")
    else:
        print("copyied " + str(num_files) + " files")
def str_to_bool(value):
    """Convert a command line token such as ``true`` or ``0`` into a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    every non-empty value (including ``"False"``) becomes ``True``. This
    converter interprets the text instead.

    :param value: the raw argument (a ``str``, or a ``bool`` default)
    :returns: the parsed boolean
    :raises argparse.ArgumentTypeError: if the text is not a known boolean word
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


if __name__ == "__main__":
    import math

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', required=True, help="noise directory")
    parser.add_argument('--output_dir', required=True, help="common voice root directory")
    parser.add_argument('--target_name', default="__noise", help="class and directory name")
    parser.add_argument('--ratio', type=float, nargs=3, default=[0.8, 0.1, 0.1],
                        help="train, test and dev fractions; must sum to 1")
    parser.add_argument('--move', type=str_to_bool, default=True,
                        help="whether to move or just copy (true/false)")
    cli_args = parser.parse_args()

    # Validate with parser.error() rather than assert: asserts vanish under
    # "python -O", and exact float equality rejects valid input such as
    # "0.7 0.2 0.1", which sums to 0.9999999999999999.
    if any(part < 0.0 or part > 1.0 for part in cli_args.ratio):
        parser.error("--ratio values must each lie between 0 and 1")
    if not math.isclose(sum(cli_args.ratio), 1.0):
        parser.error("--ratio values must sum to 1")

    split_dataset_to_cv_dir(cli_args.input_dir, cli_args.output_dir, cli_args.ratio,
                            cli_args.target_name, cli_args.move)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment