Files
db_assistant/test.py
T
2024-05-18 11:10:43 +07:00

43 lines
1.4 KiB
Python

import os

import requests
from bs4 import BeautifulSoup
# Build a small TTS dataset from the GLaDOS voice-lines wiki page:
#   1. collect caption -> link pairs from every <li> on the page,
#   2. download every collected link into MyTTSDataset/wavs/,
#   3. write one "wavN|caption" line per saved file to the transcript.

REQUEST_TIMEOUT = 30  # seconds; requests.get has no timeout by default

repetition = 0  # times an already-collected caption was encountered again
response = {}   # caption text -> link target, in page order

soup = BeautifulSoup(
    requests.get(
        'https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru',
        timeout=REQUEST_TIMEOUT,
    ).text,
    features='html.parser'
)

for li in soup.find_all('li'):
    found = []

    # Pass 1: caption from the <i> element, link from the audio-player span.
    # Every lookup is checked explicitly, so a missing element or a link
    # without an href skips the pair instead of raising TypeError/KeyError.
    caption = li.find('i')
    player = li.find('span', class_=['audio-player'])
    player_link = player.find('a') if player is not None else None
    if caption is not None and player_link is not None:
        url = player_link.get('href')
        if url is not None:
            found.append((caption.text, url))

    # Pass 2: text and href of the first <a> in the item. This runs for every
    # <li>, so any list item's first link is collected whether or not it
    # points at audio; unusable targets are filtered out at download time.
    first_link = li.find('a')
    if first_link is not None:
        url = first_link.get('href')
        if url is not None:
            found.append((first_link.text, url))

    # The first link seen for a caption wins; later sightings are only counted.
    for text, url in found:
        if text in response:
            repetition += 1
        else:
            response[text] = url

print(f'Количество найденный элементов: {len(response)}')
print(f'Количество повторении: {repetition}')

# Make sure the output tree exists before anything is written into it.
os.makedirs('MyTTSDataset/wavs', exist_ok=True)

# Captions are non-ASCII, so the transcript encoding is pinned to UTF-8
# rather than left to the platform default.
with open('MyTTSDataset/transcript.txt', 'w', encoding='utf-8') as f:
    # index numbers every collected entry, including the ones skipped below,
    # so the wavN numbering may contain gaps.
    for index, (key, value) in enumerate(response.items()):
        try:
            audio = requests.get(value, timeout=REQUEST_TIMEOUT)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema):
            # Relative or non-HTTP link: not downloadable, skip silently.
            continue
        except requests.exceptions.RequestException as error:
            # A network failure on one link must not abort the whole run.
            print(f'Не удалось скачать {value}: {error}')
            continue

        if audio.status_code != 200:
            continue

        with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
            file.write(audio.content)
        # NOTE(review): the first whitespace-separated word of the caption is
        # dropped here - presumably a label/prefix; confirm against the page.
        f.write(f'wav{index}|{" ".join(key.split()[1:])}\n')