fix download dataset
This commit is contained in:
@@ -1,6 +1,17 @@
|
||||
import re
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Characters kept in a transcript line: Russian letters (including Ё/ё),
# ASCII digits, the space character and a small set of punctuation marks.
_ALLOWED_CHARS = frozenset(
    "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890 !,.?-"
)

# Leading list numbering such as "12. " at the very start of the string.
# The dot is escaped on purpose: an unescaped "." matches any character, which
# made the pattern eat real leading numbers ("12 стульев" -> "стульев").
_LEADING_NUMBER_RE = re.compile(r'^\d+\.\s+')


def filter_string(input_string: str) -> str:
    """Return *input_string* cleaned up for use as a transcript line.

    Two steps are applied in order:

    1. A leading numbering prefix (digits, a literal dot, then whitespace,
       e.g. ``"3. "``) is removed from the start of the string.
    2. Every character that is not in the allow-list (Russian letters,
       digits, space and ``! , . ? -``) is dropped. Latin letters, tabs,
       newlines and any other symbols are therefore removed, not replaced.

    Args:
        input_string: Raw text to clean.

    Returns:
        The filtered text; may be empty if nothing in the input is allowed.
    """
    without_prefix = _LEADING_NUMBER_RE.sub('', input_string)
    return ''.join(char for char in without_prefix if char in _ALLOWED_CHARS)
|
||||
|
||||
|
||||
repetition = 0
|
||||
response = {}
|
||||
soup = BeautifulSoup(
|
||||
@@ -33,9 +44,11 @@ with open('MyTTSDataset/transcript.txt', 'w') as f:
|
||||
try:
|
||||
response = requests.get(value)
|
||||
if response.status_code == 200:
|
||||
with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
|
||||
file.write(response.content)
|
||||
f.write(f'wav{index}|{" ".join(key.split()[1:])}\n')
|
||||
key = filter_string(key)
|
||||
if key and len(key.replace(" ", "")) > 3:
|
||||
with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
|
||||
file.write(response.content)
|
||||
f.write(f'wav{index}|{key}\n')
|
||||
except requests.exceptions.MissingSchema:
|
||||
pass
|
||||
except requests.exceptions.InvalidSchema:
|
||||
|
||||
Reference in New Issue
Block a user