"""Build a small TTS dataset from the Russian GLaDOS voice-line wiki page.

The script downloads the wiki page, collects (text, URL) pairs from every
``<li>`` element, downloads each URL and writes the audio next to a
``transcript.txt`` file with lines of the form ``wav<index>|<text>``.
"""
import re
from pathlib import Path
from typing import Dict, Optional, Tuple

import requests
from bs4 import BeautifulSoup

WIKI_URL = 'https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru'
DATASET_DIR = Path('MyTTSDataset')
# Seconds to wait for any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30

# Characters kept in a transcript line: Cyrillic letters, digits, space and
# basic punctuation. A frozenset gives O(1) membership tests.
ALLOWED_CHARS = frozenset(
    "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890 !,.?-"
)

# Leading list numbering such as "12. ". The dot is escaped so that a line
# which merely *starts with a number* (e.g. "50 000 ...") keeps its digits.
_LEADING_NUMBER_RE = re.compile(r'^\d+\.\s+')


def filter_string(input_string: str) -> str:
    """Return *input_string* reduced to the characters allowed in a transcript.

    A leading "<digits>. " list prefix is removed first; after that every
    character that is not in ``ALLOWED_CHARS`` is dropped.
    """
    without_number = _LEADING_NUMBER_RE.sub('', input_string)
    return ''.join(char for char in without_number if char in ALLOWED_CHARS)


def _italic_entry(li) -> Optional[Tuple[str, str]]:
    """Return (italic text, audio-player link) for *li*, or None if absent."""
    italic = li.find('i')
    player = li.find('span', class_='audio-player')
    if italic is None or player is None:
        return None
    link = player.find('a')
    if link is None:
        return None
    href = link.get('href')
    if href is None:
        return None
    return italic.text, href


def _anchor_entry(li) -> Optional[Tuple[str, str]]:
    """Return (text, href) of the first ``<a>`` inside *li*, or None if absent."""
    link = li.find('a')
    if link is None:
        return None
    href = link.get('href')
    if href is None:
        return None
    return link.text, href


def _collect_voice_lines(soup) -> Tuple[Dict[str, str], int]:
    """Collect text -> URL pairs from every ``<li>`` of the parsed page.

    Each list item is probed twice: once for an italic caption paired with
    an audio-player link, once for its first plain anchor. The first URL
    seen for a given text wins; later occurrences are only counted.

    Returns the mapping and the number of repeated texts.
    """
    voice_lines: Dict[str, str] = {}
    repetition = 0
    for li in soup.find_all('li'):
        for entry in (_italic_entry(li), _anchor_entry(li)):
            if entry is None:
                continue
            text, url = entry
            if text in voice_lines:
                repetition += 1
            else:
                voice_lines[text] = url
    return voice_lines, repetition


def _download_dataset(voice_lines: Dict[str, str], dataset_dir: Path) -> None:
    """Download every usable voice line and write the transcript file.

    Entries whose filtered text has three or fewer non-space characters are
    skipped. The wav index is the position of the entry in *voice_lines*,
    so indices of skipped entries are simply left unused.
    """
    wav_dir = dataset_dir / 'wavs'
    wav_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8: the transcript is Cyrillic and must not depend on the
    # platform's default encoding.
    with open(dataset_dir / 'transcript.txt', 'w', encoding='utf-8') as transcript:
        for index, (text, url) in enumerate(voice_lines.items()):
            cleaned = filter_string(text)
            # Filter before downloading so rejected entries cost no request.
            if len(cleaned.replace(' ', '')) <= 3:
                continue
            try:
                reply = requests.get(url, timeout=REQUEST_TIMEOUT)
            except (requests.exceptions.MissingSchema,
                    requests.exceptions.InvalidSchema):
                # Relative or non-HTTP links are expected here; skip quietly.
                continue
            except requests.exceptions.RequestException as error:
                # One unreachable file should not abort the whole run.
                print(f'Не удалось скачать {url}: {error}')
                continue
            if reply.status_code != 200:
                continue
            (wav_dir / f'wav{index}.wav').write_bytes(reply.content)
            transcript.write(f'wav{index}|{cleaned}\n')


def main() -> None:
    """Scrape the wiki page, report statistics and build the dataset."""
    page = requests.get(WIKI_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, features='html.parser')

    voice_lines, repetition = _collect_voice_lines(soup)
    print(f'Количество найденный элементов: {len(voice_lines)}')
    print(f'Количество повторении: {repetition}')

    _download_dataset(voice_lines, DATASET_DIR)


if __name__ == '__main__':
    main()