-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.py
73 lines (61 loc) · 2.02 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy as np
import soundfile as sf
import os
import time
from tritonclient.utils import *
from PIL import Image
import tritonclient.http as httpclient
from datasets import load_dataset
import librosa
import soundfile
import jiwer
import pandas as pd
import sys
import json
def transform_path_to_filename(path):
    """
    Flatten a dataset audio path into a single ``.wav`` filename.

    The transformation:
    1. Keeps only the portion after the last 'extracted/' (the whole path
       is kept when that marker is absent).
    2. Converts every '/' to '-' so the result has no directory parts.
    3. Swaps a trailing '.mp3' extension for '.wav'.

    Args:
        path (str): The input path

    Returns:
        str: Transformed filename
    """
    # split(...)[-1] yields the text after the last marker, or the whole
    # string when the marker does not occur.
    extracted_part = path.split('extracted/')[-1]
    # Replace all '/' with '-'
    transformed = extracted_part.replace('/', '-')
    # Only the file extension is rewritten; a '.mp3' that happens to appear
    # inside a directory or base name must be left alone.
    if transformed.endswith('.mp3'):
        transformed = transformed[:-len('.mp3')] + '.wav'
    return transformed
if __name__ == "__main__":
    # Script entry point: take up to <samples> clips from the English test
    # split of mozilla-foundation/common_voice_11_0, resample each one to
    # 16 kHz, write it under handlers/triton/audio, and save a
    # filename -> sentence mapping to map.json.

    # sys.argv[0] is the script name, so a user-supplied argument means the
    # list has at least two entries.
    if len(sys.argv) < 2:
        print("Usage: python preprocessing.py <samples>")
        sys.exit(1)
    try:
        samples = int(sys.argv[1])
    except ValueError:
        # A non-integer argument gets the usage message, not a traceback.
        print("Usage: python preprocessing.py <samples>")
        sys.exit(1)

    dataset_name = 'mozilla-foundation/common_voice_11_0'
    split = 'test'
    dataset = load_dataset(dataset_name, "en", split=split, trust_remote_code=True)
    print(dataset)

    output_dir = "handlers/triton/audio"
    os.makedirs(output_dir, exist_ok=True)

    # Maps each written filename to the sentence stored with that clip.
    op = {}
    for audio_file in dataset:
        # `samples` counts down to zero; a negative starting value never
        # reaches zero, so the whole split is processed in that case.
        if samples == 0:
            break
        audio = audio_file['audio']['array']
        print(f'{samples}\n\n\n')
        print("before", audio)
        audio = librosa.resample(audio, orig_sr=audio_file['audio']['sampling_rate'], target_sr=16000)
        print(os.path.split(audio_file["path"]))
        # Compute the flattened name once; it is both the map key and the
        # output filename.
        filename = transform_path_to_filename(audio_file["path"])
        op[filename] = audio_file["sentence"]
        output_path = f'{output_dir}/{filename}'
        sf.write(output_path, audio, 16000)
        print("after: ", audio)
        samples = samples - 1

    with open('map.json', 'w') as f:
        json.dump(op, f)