fix download dataset
This commit is contained in:
@@ -1,6 +1,17 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
# Characters kept by filter_string: Russian upper- and lower-case letters
# (including Ё/ё), ASCII digits, the space, and the punctuation "!,.?-".
# Stored as a frozenset so each membership test is O(1) and the collection
# is built once at import time instead of on every call.
_ALLOWED_CHARS = frozenset(
    "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890 !,.?-"
)

# Prefix removed from the start of the input: one or more digits, then one
# character, then one or more whitespace characters (e.g. "12. ").
# NOTE(review): the "." is unescaped, so it matches ANY character after the
# digits, not only a literal period ("123 text" loses "123 "). Confirm whether
# a literal r'\.' was intended before tightening it.
_LEADING_NUMBER_RE = re.compile(r'^\d+.\s+')


def filter_string(input_string: str) -> str:
    """Strip a leading number prefix and drop every disallowed character.

    The input is processed in two steps:

    1. A prefix matching ``^\\d+.\\s+`` at the very start of the string is
       removed (at most one occurrence, since the pattern is anchored).
    2. Every remaining character that is not in ``_ALLOWED_CHARS`` is
       discarded; allowed characters keep their original order.

    Args:
        input_string: Raw text to clean.

    Returns:
        The cleaned text. May be empty when nothing in the input is allowed.
    """
    without_prefix = _LEADING_NUMBER_RE.sub('', input_string)
    return ''.join(char for char in without_prefix if char in _ALLOWED_CHARS)
|
||||||
|
|
||||||
|
|
||||||
repetition = 0
|
repetition = 0
|
||||||
response = {}
|
response = {}
|
||||||
soup = BeautifulSoup(
|
soup = BeautifulSoup(
|
||||||
@@ -33,9 +44,11 @@ with open('MyTTSDataset/transcript.txt', 'w') as f:
|
|||||||
try:
|
try:
|
||||||
response = requests.get(value)
|
response = requests.get(value)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
|
key = filter_string(key)
|
||||||
|
if key and len(key.replace(" ", "")) > 3:
|
||||||
with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
|
with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
|
||||||
file.write(response.content)
|
file.write(response.content)
|
||||||
f.write(f'wav{index}|{" ".join(key.split()[1:])}\n')
|
f.write(f'wav{index}|{key}\n')
|
||||||
except requests.exceptions.MissingSchema:
|
except requests.exceptions.MissingSchema:
|
||||||
pass
|
pass
|
||||||
except requests.exceptions.InvalidSchema:
|
except requests.exceptions.InvalidSchema:
|
||||||
|
|||||||
Reference in New Issue
Block a user