add test download dataset
This commit is contained in:
+1
-1
@@ -9,7 +9,7 @@ __pycache__/
|
|||||||
data/model_small/
|
data/model_small/
|
||||||
data/model_large/
|
data/model_large/
|
||||||
data/v4_ru.pt
|
data/v4_ru.pt
|
||||||
wav_files/
|
MyTTSDataset/
|
||||||
vocal.wav
|
vocal.wav
|
||||||
|
|
||||||
# C extensions
|
# C extensions
|
||||||
|
|||||||
@@ -76,5 +76,6 @@ home_assistant_execute:
|
|||||||
- включи телевизор
|
- включи телевизор
|
||||||
- выключи телевизор
|
- выключи телевизор
|
||||||
- начни уборку
|
- начни уборку
|
||||||
|
- убрать мою комнату
|
||||||
home_assistant_get:
|
home_assistant_get:
|
||||||
- тест
|
- тест
|
||||||
Generated
+2813
-48
File diff suppressed because it is too large
Load Diff
+2
-1
@@ -6,7 +6,7 @@ authors = ["dmitrium12 <belicdima8@gmail.com>"]
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.11"
|
python = ">=3.11,<3.12"
|
||||||
vosk = "^0.3.45"
|
vosk = "^0.3.45"
|
||||||
pvporcupine = "^3.0.1"
|
pvporcupine = "^3.0.1"
|
||||||
pvrecorder = "^1.2.1"
|
pvrecorder = "^1.2.1"
|
||||||
@@ -29,6 +29,7 @@ ruff = "^0.4.2"
|
|||||||
noisereduce = "^3.0.2"
|
noisereduce = "^3.0.2"
|
||||||
environs = "^11.0.0"
|
environs = "^11.0.0"
|
||||||
webrtcvad = "^2.0.10"
|
webrtcvad = "^2.0.10"
|
||||||
|
tts = "^0.22.0"
|
||||||
|
|
||||||
|
|
||||||
[[tool.poetry.source]]
|
[[tool.poetry.source]]
|
||||||
|
|||||||
@@ -1,41 +1,42 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
# URL веб-страницы, которую нужно спарсить
|
repetition = 0
|
||||||
url = 'https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru'
|
response = {}
|
||||||
|
soup = BeautifulSoup(
|
||||||
# Получаем содержимое страницы
|
requests.get('https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru').text,
|
||||||
response = requests.get(url)
|
features='html.parser'
|
||||||
response.raise_for_status() # Проверка на успешный запрос
|
)
|
||||||
|
for li in soup.find_all('li'):
|
||||||
# Парсим HTML
|
try:
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
i = li.find('i').text
|
||||||
|
url = li.find('span', class_=['audio-player']).find('a')['href']
|
||||||
# Находим все теги <a>
|
if i not in response.keys():
|
||||||
links = soup.find_all('a')
|
response[i] = url
|
||||||
|
else:
|
||||||
# Фильтруем ссылки, которые заканчиваются на .wav
|
repetition += 1
|
||||||
wav_links = [
|
except AttributeError:
|
||||||
link.get('href') for link in links if link.get('href') and link.get('href').endswith('.wav')
|
pass
|
||||||
]
|
try:
|
||||||
|
i = li.find('a').text
|
||||||
# Создаем директорию для сохранения файлов, если её нет
|
url = li.find('a')['href']
|
||||||
os.makedirs('wav_files', exist_ok=True)
|
if i not in response.keys():
|
||||||
|
response[i] = url
|
||||||
# Скачиваем каждый wav-файл
|
else:
|
||||||
for wav_link in wav_links:
|
repetition += 1
|
||||||
# Получаем имя файла из URL
|
except AttributeError:
|
||||||
filename = wav_link.split('/')[-1]
|
pass
|
||||||
file_path = os.path.join('wav_files', filename)
|
print(f'Количество найденный элементов: {len(response)}')
|
||||||
|
print(f'Количество повторении: {repetition}')
|
||||||
# Скачиваем файл
|
with open('MyTTSDataset/transcript.txt', 'w') as f:
|
||||||
response = requests.get(wav_link)
|
for index, (key, value) in enumerate(response.items()):
|
||||||
response.raise_for_status()
|
try:
|
||||||
|
response = requests.get(value)
|
||||||
# Сохраняем файл
|
if response.status_code == 200:
|
||||||
with open(file_path, 'wb') as file:
|
with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
|
||||||
file.write(response.content)
|
file.write(response.content)
|
||||||
|
f.write(f'wav{index}|{" ".join(key.split()[1:])}\n')
|
||||||
print(f"Downloaded {filename}")
|
except requests.exceptions.MissingSchema:
|
||||||
|
pass
|
||||||
|
except requests.exceptions.InvalidSchema:
|
||||||
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user