add test download dataset

This commit is contained in:
2024-05-18 11:10:43 +07:00
parent 197dc50529
commit 9b7ca85831
5 changed files with 2856 additions and 88 deletions
+1 -1
View File
@@ -9,7 +9,7 @@ __pycache__/
data/model_small/ data/model_small/
data/model_large/ data/model_large/
data/v4_ru.pt data/v4_ru.pt
wav_files/ MyTTSDataset/
vocal.wav vocal.wav
# C extensions # C extensions
+1
View File
@@ -76,5 +76,6 @@ home_assistant_execute:
- включи телевизор - включи телевизор
- выключи телевизор - выключи телевизор
- начни уборку - начни уборку
- убрать мою комнату
home_assistant_get: home_assistant_get:
- тест - тест
Generated
+2813 -48
View File
File diff suppressed because it is too large Load Diff
+2 -1
View File
@@ -6,7 +6,7 @@ authors = ["dmitrium12 <belicdima8@gmail.com>"]
readme = "README.md" readme = "README.md"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.11" python = ">=3.11,<3.12"
vosk = "^0.3.45" vosk = "^0.3.45"
pvporcupine = "^3.0.1" pvporcupine = "^3.0.1"
pvrecorder = "^1.2.1" pvrecorder = "^1.2.1"
@@ -29,6 +29,7 @@ ruff = "^0.4.2"
noisereduce = "^3.0.2" noisereduce = "^3.0.2"
environs = "^11.0.0" environs = "^11.0.0"
webrtcvad = "^2.0.10" webrtcvad = "^2.0.10"
tts = "^0.22.0"
[[tool.poetry.source]] [[tool.poetry.source]]
+39 -38
View File
@@ -1,41 +1,42 @@
import os
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# URL веб-страницы, которую нужно спарсить repetition = 0
url = 'https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru' response = {}
soup = BeautifulSoup(
# Получаем содержимое страницы requests.get('https://theportalwiki.com/wiki/GLaDOS_voice_lines/ru').text,
response = requests.get(url) features='html.parser'
response.raise_for_status() # Проверка на успешный запрос )
for li in soup.find_all('li'):
# Парсим HTML try:
soup = BeautifulSoup(response.text, 'html.parser') i = li.find('i').text
url = li.find('span', class_=['audio-player']).find('a')['href']
# Находим все теги <a> if i not in response.keys():
links = soup.find_all('a') response[i] = url
else:
# Фильтруем ссылки, которые заканчиваются на .wav repetition += 1
wav_links = [ except AttributeError:
link.get('href') for link in links if link.get('href') and link.get('href').endswith('.wav') pass
] try:
i = li.find('a').text
# Создаем директорию для сохранения файлов, если её нет url = li.find('a')['href']
os.makedirs('wav_files', exist_ok=True) if i not in response.keys():
response[i] = url
# Скачиваем каждый wav-файл else:
for wav_link in wav_links: repetition += 1
# Получаем имя файла из URL except AttributeError:
filename = wav_link.split('/')[-1] pass
file_path = os.path.join('wav_files', filename) print(f'Количество найденный элементов: {len(response)}')
print(f'Количество повторении: {repetition}')
# Скачиваем файл with open('MyTTSDataset/transcript.txt', 'w') as f:
response = requests.get(wav_link) for index, (key, value) in enumerate(response.items()):
response.raise_for_status() try:
response = requests.get(value)
# Сохраняем файл if response.status_code == 200:
with open(file_path, 'wb') as file: with open(f"MyTTSDataset/wavs/wav{index}.wav", 'wb') as file:
file.write(response.content) file.write(response.content)
f.write(f'wav{index}|{" ".join(key.split()[1:])}\n')
print(f"Downloaded {filename}") except requests.exceptions.MissingSchema:
pass
except requests.exceptions.InvalidSchema:
pass