56 lines
1.9 KiB
Python
56 lines
1.9 KiB
Python
import os
import re

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Characters allowed in a transcript line: Russian letters, digits, space and
# basic punctuation.  A frozenset gives O(1) membership tests and is built once.
_ALLOWED_CHARS = frozenset(
    "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890 !,.?-"
)

# Leading list numbering such as "12. ".  The dot is escaped so that only a
# literal "." after the digits counts as numbering; an unescaped "." would also
# swallow real leading digits (e.g. "10 лет" -> "лет").
_NUMBERING_PREFIX = re.compile(r'^\d+\.\s+')


def filter_string(input_string: str) -> str:
    """Return *input_string* reduced to the characters allowed in transcripts.

    A leading "<digits>. " list-numbering prefix is removed first, then every
    character that is not a Russian letter, a digit, a space or one of
    ``! , . ? -`` is dropped.

    Args:
        input_string: Raw caption text.

    Returns:
        The cleaned caption; an empty string if nothing allowed remains.
    """
    without_prefix = _NUMBERING_PREFIX.sub('', input_string)
    return ''.join(char for char in without_prefix if char in _ALLOWED_CHARS)
|
|
|
|
|
|
# Scrape the Russian GLaDOS voice-line page, collect caption -> link pairs,
# then download every link that answers 200 and write a "wavN|caption"
# transcript next to the audio files.

# Seconds to wait for any single HTTP request; without a timeout `requests`
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

repetition = 0  # how many captions were seen again after their first occurrence
response = {}   # caption text -> href of the first link found for it

page = requests.get(
    'https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru',
    timeout=REQUEST_TIMEOUT,
)
page.raise_for_status()
soup = BeautifulSoup(page.text, features='html.parser')

for li in soup.find_all('li'):
    candidates = []

    # Pattern 1: an <i> caption together with a link inside an
    # "audio-player" span.
    italic = li.find('i')
    player = li.find('span', class_=['audio-player'])
    player_link = player.find('a') if player is not None else None
    if italic is not None and player_link is not None:
        candidates.append((italic.text, player_link.get('href')))

    # Pattern 2: the first <a> of the item, using its own text as the caption.
    anchor = li.find('a')
    if anchor is not None:
        candidates.append((anchor.text, anchor.get('href')))

    for caption, href in candidates:
        if href is None:
            # Anchor without an href attribute: there is nothing to download.
            continue
        if caption in response:
            repetition += 1
        else:
            response[caption] = href

print(f'Количество найденный элементов: {len(response)}')
print(f'Количество повторении: {repetition}')

# Make sure the output tree exists before anything is written into it.
os.makedirs('MyTTSDataset/wavs', exist_ok=True)

# Explicit UTF-8: the transcript contains Cyrillic text and must not depend on
# the platform's default encoding.
with open('MyTTSDataset/transcript.txt', 'w', encoding='utf-8') as f:
    # `index` is the position within `response`, so file numbering has gaps
    # wherever an entry is skipped.
    for index, (key, value) in enumerate(response.items()):
        try:
            audio = requests.get(value, timeout=REQUEST_TIMEOUT)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema):
            # Relative or non-HTTP hrefs are expected among the scraped links;
            # skip them silently.
            continue
        except requests.exceptions.RequestException as error:
            # A network problem with one file should not abort the whole run.
            print(f'Не удалось скачать {value}: {error}')
            continue

        if audio.status_code != 200:
            continue

        key = filter_string(key)
        # Skip captions with three or fewer non-space characters after cleaning.
        if len(key.replace(" ", "")) <= 3:
            continue

        with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
            file.write(audio.content)
        f.write(f'wav{index}|{key}\n')
|