From d16f4a84bb61d541c8f3aff5cd3050b4fe4c5c90 Mon Sep 17 00:00:00 2001 From: Alireza Date: Sat, 2 Aug 2025 12:21:28 +0330 Subject: [PATCH] Update .gitignore to include .flac files, relocate docker-compose.yml into vosk/, modify batch_confirm_hf.py for improved error handling and dataset upload logic, and adjust Dockerfile paths for service code and model directory. --- .gitignore | 2 + docker-compose.yml => vosk/docker-compose.yml | 5 +- vosk/test_files/batch_confirm_hf.py | 83 +++++++++---------- vosk/vosk_service/Dockerfile | 6 +- 4 files changed, 47 insertions(+), 49 deletions(-) rename docker-compose.yml => vosk/docker-compose.yml (79%) diff --git a/.gitignore b/.gitignore index 23f21da..2e65e17 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,5 @@ confirmed_dataset/ # Kiro files *.kiro + +*.flac \ No newline at end of file diff --git a/docker-compose.yml b/vosk/docker-compose.yml similarity index 79% rename from docker-compose.yml rename to vosk/docker-compose.yml index b945b69..65757e3 100644 --- a/docker-compose.yml +++ b/vosk/docker-compose.yml @@ -1,7 +1,8 @@ -version: '3.8' services: vosk: - build: ./vosk_service + build: + context: . 
+ dockerfile: vosk_service/Dockerfile container_name: vosk-api ports: - "5000:5000" diff --git a/vosk/test_files/batch_confirm_hf.py b/vosk/test_files/batch_confirm_hf.py index d7b3098..9238fe8 100644 --- a/vosk/test_files/batch_confirm_hf.py +++ b/vosk/test_files/batch_confirm_hf.py @@ -14,7 +14,7 @@ from huggingface_hub import HfApi, create_repo print("Loading dataset...") ds = load_dataset( "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0", - split="validated[:500]", + split="validated[:5]", streaming=False ).cast_column("audio", Audio(sampling_rate=16000)) @@ -127,10 +127,11 @@ if confirmed: # Push to Hugging Face Hub print(f"\nšŸš€ Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...") try: - # Initialize HF API - api = HfApi() + # Initialize HF API with token + api = HfApi(token=os.getenv("HF_TOKEN")) # Create the repository (private if specified) + repo_created = False try: create_repo( repo_id=HF_DATASET_NAME, @@ -139,52 +140,46 @@ if confirmed: exist_ok=True ) print(f"āœ… Repository '{HF_DATASET_NAME}' created/verified") + repo_created = True except Exception as e: - print(f"āš ļø Repository creation: {e}") + print(f"āš ļø Repository creation failed: {e}") + print("šŸ’” Please create the repository manually on Hugging Face Hub first") + print(f"šŸ’” Or change HF_DATASET_NAME to use your own username") + print("šŸ’” Skipping upload due to repository creation failure") - # Upload all parquet files - for i in range(num_shards): - shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet") - if os.path.exists(shard_path): - api.upload_file( - path_or_fileobj=shard_path, - path_in_repo=f"confirmed_shard_{i:02}.parquet", - repo_id=HF_DATASET_NAME, - repo_type="dataset" - ) - print(f"šŸ“¤ Uploaded shard {i+1}/{num_shards}") - - # Create dataset info file - dataset_info = { - "dataset_name": HF_DATASET_NAME, - "description": "Persian Common Voice confirmed samples for Whisper fine-tuning", - "total_samples": len(confirmed), - "num_shards": 
num_shards, - "audio_format": "int16 PCM, 16kHz", - "columns": ["audio", "text"], - "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0", - "processing": "Vosk API batch confirmation" - } - - # Upload dataset info - import tempfile - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - json.dump(dataset_info, f, indent=2, ensure_ascii=False) - info_path = f.name - - api.upload_file( - path_or_fileobj=info_path, - path_in_repo="dataset_info.json", - repo_id=HF_DATASET_NAME, - repo_type="dataset" - ) - os.unlink(info_path) - - print(f"šŸŽ‰ Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}") + if not repo_created: + print("šŸ’” Skipping upload due to repository creation failure") + else: + # Create dataset info file + dataset_info = { + "dataset_name": HF_DATASET_NAME, + "description": "Persian Common Voice confirmed samples for Whisper fine-tuning", + "total_samples": len(confirmed), + "num_shards": num_shards, + "audio_format": "int16 PCM, 16kHz", + "columns": ["audio", "text"], + "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0", + "processing": "Vosk API batch confirmation" + } + + # Save dataset info to output directory + info_path = os.path.join(output_dir, "dataset_info.json") + with open(info_path, 'w', encoding='utf-8') as f: + json.dump(dataset_info, f, indent=2, ensure_ascii=False) + + # Upload entire folder using upload_folder + api.upload_folder( + folder_path=output_dir, + repo_id=HF_DATASET_NAME, + repo_type="dataset", + ) + + print(f"šŸŽ‰ Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}") except Exception as e: print(f"āŒ Failed to push to Hugging Face: {e}") - print("šŸ’” Make sure you're logged in with: huggingface-cli login") + print("šŸ’” Make sure you have HF_TOKEN environment variable set") + print("šŸ’” Set it with: export HF_TOKEN=your_token_here") else: print("āŒ No confirmed samples to save") \ No newline at end of file 
diff --git a/vosk/vosk_service/Dockerfile b/vosk/vosk_service/Dockerfile index 6861160..55561cf 100644 --- a/vosk/vosk_service/Dockerfile +++ b/vosk/vosk_service/Dockerfile @@ -8,14 +8,14 @@ RUN apt-get update && apt-get install -y \ WORKDIR /app # Install Python dependencies -COPY requirements.txt ./ +COPY vosk_service/requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt # Copy service code -COPY app.py ./ +COPY vosk_service/app.py ./ # Copy model directory -COPY model/ ./model/ +COPY vosk-model-fa-0.42/ ./model/ EXPOSE 5000