Update .gitignore to ignore .flac files, update docker-compose.yml (drop the `version` key and build from the repository root with an explicit Dockerfile path), modify batch_confirm_hf.py for improved error handling and dataset upload logic, and adjust Dockerfile paths for service code and model directory.

This commit is contained in:
Alireza
2025-08-02 12:21:28 +03:30
parent 640363fef2
commit d16f4a84bb
4 changed files with 47 additions and 49 deletions

2
.gitignore vendored
View File

@@ -23,3 +23,5 @@ confirmed_dataset/
# Kiro files # Kiro files
*.kiro *.kiro
*.flac

View File

@@ -1,7 +1,8 @@
version: '3.8'
services: services:
vosk: vosk:
build: ./vosk_service build:
context: .
dockerfile: vosk_service/Dockerfile
container_name: vosk-api container_name: vosk-api
ports: ports:
- "5000:5000" - "5000:5000"

View File

@@ -14,7 +14,7 @@ from huggingface_hub import HfApi, create_repo
print("Loading dataset...") print("Loading dataset...")
ds = load_dataset( ds = load_dataset(
"Ashegh-Sad-Warrior/Persian_Common_Voice_17_0", "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
split="validated[:500]", split="validated[:5]",
streaming=False streaming=False
).cast_column("audio", Audio(sampling_rate=16000)) ).cast_column("audio", Audio(sampling_rate=16000))
@@ -127,10 +127,11 @@ if confirmed:
# Push to Hugging Face Hub # Push to Hugging Face Hub
print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...") print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
try: try:
# Initialize HF API # Initialize HF API with token
api = HfApi() api = HfApi(token=os.getenv("HF_TOKEN"))
# Create the repository (private if specified) # Create the repository (private if specified)
repo_created = False
try: try:
create_repo( create_repo(
repo_id=HF_DATASET_NAME, repo_id=HF_DATASET_NAME,
@@ -139,21 +140,16 @@ if confirmed:
exist_ok=True exist_ok=True
) )
print(f"✅ Repository '{HF_DATASET_NAME}' created/verified") print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
repo_created = True
except Exception as e: except Exception as e:
print(f"⚠️ Repository creation: {e}") print(f"⚠️ Repository creation failed: {e}")
print("💡 Please create the repository manually on Hugging Face Hub first")
# Upload all parquet files print(f"💡 Or change HF_DATASET_NAME to use your own username")
for i in range(num_shards): print("💡 Skipping upload due to repository creation failure")
shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
if os.path.exists(shard_path):
api.upload_file(
path_or_fileobj=shard_path,
path_in_repo=f"confirmed_shard_{i:02}.parquet",
repo_id=HF_DATASET_NAME,
repo_type="dataset"
)
print(f"📤 Uploaded shard {i+1}/{num_shards}")
if not repo_created:
print("💡 Skipping upload due to repository creation failure")
else:
# Create dataset info file # Create dataset info file
dataset_info = { dataset_info = {
"dataset_name": HF_DATASET_NAME, "dataset_name": HF_DATASET_NAME,
@@ -166,25 +162,24 @@ if confirmed:
"processing": "Vosk API batch confirmation" "processing": "Vosk API batch confirmation"
} }
# Upload dataset info # Save dataset info to output directory
import tempfile info_path = os.path.join(output_dir, "dataset_info.json")
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: with open(info_path, 'w', encoding='utf-8') as f:
json.dump(dataset_info, f, indent=2, ensure_ascii=False) json.dump(dataset_info, f, indent=2, ensure_ascii=False)
info_path = f.name
api.upload_file( # Upload entire folder using upload_folder
path_or_fileobj=info_path, api.upload_folder(
path_in_repo="dataset_info.json", folder_path=output_dir,
repo_id=HF_DATASET_NAME, repo_id=HF_DATASET_NAME,
repo_type="dataset" repo_type="dataset",
) )
os.unlink(info_path)
print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}") print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
except Exception as e: except Exception as e:
print(f"❌ Failed to push to Hugging Face: {e}") print(f"❌ Failed to push to Hugging Face: {e}")
print("💡 Make sure you're logged in with: huggingface-cli login") print("💡 Make sure you have HF_TOKEN environment variable set")
print("💡 Set it with: export HF_TOKEN=your_token_here")
else: else:
print("❌ No confirmed samples to save") print("❌ No confirmed samples to save")

View File

@@ -8,14 +8,14 @@ RUN apt-get update && apt-get install -y \
WORKDIR /app WORKDIR /app
# Install Python dependencies # Install Python dependencies
COPY requirements.txt ./ COPY vosk_service/requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Copy service code # Copy service code
COPY app.py ./ COPY vosk_service/app.py ./
# Copy model directory # Copy model directory
COPY model/ ./model/ COPY vosk-model-fa-0.42/ ./model/
EXPOSE 5000 EXPOSE 5000