Update .gitignore to ignore .flac files; update docker-compose.yml (drop the obsolete `version` key and build from the repository root with an explicit Dockerfile path); modify batch_confirm_hf.py for improved error handling and dataset upload logic; and adjust Dockerfile paths for service code and model directory.
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -23,3 +23,5 @@ confirmed_dataset/
|
|||||||
|
|
||||||
# Kiro files
|
# Kiro files
|
||||||
*.kiro
|
*.kiro
|
||||||
|
|
||||||
|
*.flac
|
||||||
@@ -1,7 +1,8 @@
|
|||||||
version: '3.8'
|
|
||||||
services:
|
services:
|
||||||
vosk:
|
vosk:
|
||||||
build: ./vosk_service
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: vosk_service/Dockerfile
|
||||||
container_name: vosk-api
|
container_name: vosk-api
|
||||||
ports:
|
ports:
|
||||||
- "5000:5000"
|
- "5000:5000"
|
||||||
@@ -14,7 +14,7 @@ from huggingface_hub import HfApi, create_repo
|
|||||||
print("Loading dataset...")
|
print("Loading dataset...")
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
"Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
|
"Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
|
||||||
split="validated[:500]",
|
split="validated[:5]",
|
||||||
streaming=False
|
streaming=False
|
||||||
).cast_column("audio", Audio(sampling_rate=16000))
|
).cast_column("audio", Audio(sampling_rate=16000))
|
||||||
|
|
||||||
@@ -127,10 +127,11 @@ if confirmed:
|
|||||||
# Push to Hugging Face Hub
|
# Push to Hugging Face Hub
|
||||||
print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
|
print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
|
||||||
try:
|
try:
|
||||||
# Initialize HF API
|
# Initialize HF API with token
|
||||||
api = HfApi()
|
api = HfApi(token=os.getenv("HF_TOKEN"))
|
||||||
|
|
||||||
# Create the repository (private if specified)
|
# Create the repository (private if specified)
|
||||||
|
repo_created = False
|
||||||
try:
|
try:
|
||||||
create_repo(
|
create_repo(
|
||||||
repo_id=HF_DATASET_NAME,
|
repo_id=HF_DATASET_NAME,
|
||||||
@@ -139,52 +140,46 @@ if confirmed:
|
|||||||
exist_ok=True
|
exist_ok=True
|
||||||
)
|
)
|
||||||
print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
|
print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
|
||||||
|
repo_created = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Repository creation: {e}")
|
print(f"⚠️ Repository creation failed: {e}")
|
||||||
|
print("💡 Please create the repository manually on Hugging Face Hub first")
|
||||||
|
print(f"💡 Or change HF_DATASET_NAME to use your own username")
|
||||||
|
print("💡 Skipping upload due to repository creation failure")
|
||||||
|
|
||||||
# Upload all parquet files
|
if not repo_created:
|
||||||
for i in range(num_shards):
|
print("💡 Skipping upload due to repository creation failure")
|
||||||
shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
|
else:
|
||||||
if os.path.exists(shard_path):
|
# Create dataset info file
|
||||||
api.upload_file(
|
dataset_info = {
|
||||||
path_or_fileobj=shard_path,
|
"dataset_name": HF_DATASET_NAME,
|
||||||
path_in_repo=f"confirmed_shard_{i:02}.parquet",
|
"description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
|
||||||
repo_id=HF_DATASET_NAME,
|
"total_samples": len(confirmed),
|
||||||
repo_type="dataset"
|
"num_shards": num_shards,
|
||||||
)
|
"audio_format": "int16 PCM, 16kHz",
|
||||||
print(f"📤 Uploaded shard {i+1}/{num_shards}")
|
"columns": ["audio", "text"],
|
||||||
|
"source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
|
||||||
|
"processing": "Vosk API batch confirmation"
|
||||||
|
}
|
||||||
|
|
||||||
# Create dataset info file
|
# Save dataset info to output directory
|
||||||
dataset_info = {
|
info_path = os.path.join(output_dir, "dataset_info.json")
|
||||||
"dataset_name": HF_DATASET_NAME,
|
with open(info_path, 'w', encoding='utf-8') as f:
|
||||||
"description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
|
json.dump(dataset_info, f, indent=2, ensure_ascii=False)
|
||||||
"total_samples": len(confirmed),
|
|
||||||
"num_shards": num_shards,
|
|
||||||
"audio_format": "int16 PCM, 16kHz",
|
|
||||||
"columns": ["audio", "text"],
|
|
||||||
"source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
|
|
||||||
"processing": "Vosk API batch confirmation"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Upload dataset info
|
# Upload entire folder using upload_folder
|
||||||
import tempfile
|
api.upload_folder(
|
||||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
|
folder_path=output_dir,
|
||||||
json.dump(dataset_info, f, indent=2, ensure_ascii=False)
|
repo_id=HF_DATASET_NAME,
|
||||||
info_path = f.name
|
repo_type="dataset",
|
||||||
|
)
|
||||||
|
|
||||||
api.upload_file(
|
print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
|
||||||
path_or_fileobj=info_path,
|
|
||||||
path_in_repo="dataset_info.json",
|
|
||||||
repo_id=HF_DATASET_NAME,
|
|
||||||
repo_type="dataset"
|
|
||||||
)
|
|
||||||
os.unlink(info_path)
|
|
||||||
|
|
||||||
print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Failed to push to Hugging Face: {e}")
|
print(f"❌ Failed to push to Hugging Face: {e}")
|
||||||
print("💡 Make sure you're logged in with: huggingface-cli login")
|
print("💡 Make sure you have HF_TOKEN environment variable set")
|
||||||
|
print("💡 Set it with: export HF_TOKEN=your_token_here")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print("❌ No confirmed samples to save")
|
print("❌ No confirmed samples to save")
|
||||||
@@ -8,14 +8,14 @@ RUN apt-get update && apt-get install -y \
|
|||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install Python dependencies
|
# Install Python dependencies
|
||||||
COPY requirements.txt ./
|
COPY vosk_service/requirements.txt ./
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
# Copy service code
|
# Copy service code
|
||||||
COPY app.py ./
|
COPY vosk_service/app.py ./
|
||||||
|
|
||||||
# Copy model directory
|
# Copy model directory
|
||||||
COPY model/ ./model/
|
COPY vosk-model-fa-0.42/ ./model/
|
||||||
|
|
||||||
EXPOSE 5000
|
EXPOSE 5000
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user