Alireza
2025-07-31 17:35:08 +03:30
commit 640363fef2
27 changed files with 4201 additions and 0 deletions


@@ -0,0 +1,190 @@
from datasets import load_dataset, Audio, Dataset
import soundfile as sf
import requests
import os
from tqdm import tqdm
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from huggingface_hub import HfApi, create_repo

# Load the dataset with audio decoding
print("Loading dataset...")
ds = load_dataset(
    "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
    split="validated[:500]",
    streaming=False
).cast_column("audio", Audio(sampling_rate=16000))

output_dir = "confirmed_dataset"
os.makedirs(output_dir, exist_ok=True)

confirmed = []
API_URL = "http://localhost:5000/batch_confirm"
batch_size = 8

# Hugging Face configuration
HF_DATASET_NAME = "dpr2000/persian-cv17-confirmed"  # Change this to your desired dataset name
HF_PRIVATE = True  # Set to True for private dataset


def save_flac(audio_array, path):
    sf.write(path, audio_array, 16000, format="FLAC")


print("Processing batches...")
for i in tqdm(range(0, len(ds), batch_size)):
    batch = ds[i:i+batch_size]
    files = {}
    references = []
    temp_flacs = []
    audio_arrays = []
    # Fix: batch is a dict of lists
    for j in range(len(batch["audio"])):
        audio = batch["audio"][j]
        flac_path = f"temp_{i+j}.flac"
        save_flac(audio["array"], flac_path)
        files[f"audio{j}"] = open(flac_path, "rb")
        references.append(batch["sentence"][j])
        temp_flacs.append(flac_path)
        audio_arrays.append(audio["array"])  # Store the array for confirmed
    data = {"references": json.dumps(references)}
    try:
        response = requests.post(API_URL, files=files, data=data, timeout=120)
        if response.status_code == 200:
            resp_json = response.json()
            if "results" in resp_json:
                results = resp_json["results"]
            else:
                print(f"Batch {i} failed: 'results' key missing in response: {resp_json}")
                results = [None] * len(references)
        else:
            print(f"Batch {i} failed: HTTP {response.status_code} - {response.text}")
            results = [None] * len(references)
    except Exception as e:
        print(f"Batch {i} failed: {e}")
        results = [None] * len(references)
    for j, result in enumerate(results):
        if result and result.get("confirmed"):
            # Save confirmed audio array and transcription
            confirmed.append({"audio": audio_arrays[j], "transcription": references[j]})
            os.remove(temp_flacs[j])
        else:
            os.remove(temp_flacs[j])
    for f in files.values():
        f.close()

# Save confirmed data using sharding approach
if confirmed:
    print(f"\n🔄 Saving {len(confirmed)} confirmed samples...")

    # Convert confirmed data to HuggingFace dataset format
    def extract_minimal(example):
        # Convert float32 audio (range -1.0 to 1.0) to int16 (range -32768 to 32767)
        audio_float32 = np.array(example["audio"], dtype=np.float32)
        # Ensure audio is in valid range and scale to int16
        audio_float32 = np.clip(audio_float32, -1.0, 1.0)
        audio_int16 = (audio_float32 * 32767).astype(np.int16)
        return {
            "audio": audio_int16.tobytes(),  # Store as int16 bytes, compatible with Whisper
            "text": example["transcription"]
        }

    # Create dataset from confirmed samples
    confirmed_dataset = Dataset.from_list(confirmed)
    confirmed_dataset = confirmed_dataset.map(extract_minimal, remove_columns=confirmed_dataset.column_names)

    # Sharding parameters
    num_shards = min(1, len(confirmed))  # Don't create more shards than samples
    shard_size = len(confirmed_dataset) // num_shards + 1

    # Write each shard separately
    for i in range(num_shards):
        start = i * shard_size
        end = min(len(confirmed_dataset), (i + 1) * shard_size)
        if start >= len(confirmed_dataset):
            break
        shard = confirmed_dataset.select(range(start, end))
        table = pa.Table.from_pandas(shard.to_pandas())  # Convert to PyArrow table
        shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
        pq.write_table(
            table,
            shard_path,
            compression="zstd",
            compression_level=22,  # Maximum compression
            use_dictionary=True,
            version="2.6"
        )
        print(f"🔹 Shard {i+1}/{num_shards}: {len(shard)} samples saved")

    print(f"\n✅ All confirmed data saved in {num_shards} shards in `{output_dir}/`")

    # Push to Hugging Face Hub
    print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
    try:
        # Initialize HF API
        api = HfApi()

        # Create the repository (private if specified)
        try:
            create_repo(
                repo_id=HF_DATASET_NAME,
                repo_type="dataset",
                private=HF_PRIVATE,
                exist_ok=True
            )
            print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
        except Exception as e:
            print(f"⚠️ Repository creation: {e}")

        # Upload all parquet files
        for i in range(num_shards):
            shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
            if os.path.exists(shard_path):
                api.upload_file(
                    path_or_fileobj=shard_path,
                    path_in_repo=f"confirmed_shard_{i:02}.parquet",
                    repo_id=HF_DATASET_NAME,
                    repo_type="dataset"
                )
                print(f"📤 Uploaded shard {i+1}/{num_shards}")

        # Create dataset info file
        dataset_info = {
            "dataset_name": HF_DATASET_NAME,
            "description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
            "total_samples": len(confirmed),
            "num_shards": num_shards,
            "audio_format": "int16 PCM, 16kHz",
            "columns": ["audio", "text"],
            "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
            "processing": "Vosk API batch confirmation"
        }

        # Upload dataset info
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(dataset_info, f, indent=2, ensure_ascii=False)
            info_path = f.name
        api.upload_file(
            path_or_fileobj=info_path,
            path_in_repo="dataset_info.json",
            repo_id=HF_DATASET_NAME,
            repo_type="dataset"
        )
        os.unlink(info_path)

        print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
    except Exception as e:
        print(f"❌ Failed to push to Hugging Face: {e}")
        print("💡 Make sure you're logged in with: huggingface-cli login")
else:
    print("❌ No confirmed samples to save")


@@ -0,0 +1,52 @@
import requests
import json
import soundfile as sf
import numpy as np
import os

# Test the API connection
API_URL = "http://localhost:5000/batch_confirm"


def test_api():
    print("Testing API connection...")
    try:
        response = requests.get("http://localhost:5000/")
        print(f"API health check: {response.status_code}")
        print(f"Response: {response.json()}")
    except Exception as e:
        print(f"API not reachable: {e}")
        return False
    return True


def test_batch_confirm():
    print("\nTesting batch confirm...")
    # Create a simple test audio file
    test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
    test_path = "test_audio.flac"
    sf.write(test_path, test_audio, 16000, format="FLAC")
    # Test batch confirm
    with open(test_path, "rb") as f:
        files = {"audio0": f}
        data = {"references": json.dumps(["test sentence"])}
        try:
            response = requests.post(API_URL, files=files, data=data, timeout=30)
            print(f"Batch confirm response: {response.status_code}")
            if response.status_code == 200:
                print(f"Response JSON: {response.json()}")
            else:
                print(f"Error: {response.text}")
        except Exception as e:
            print(f"Batch confirm failed: {e}")
    # Clean up
    if os.path.exists(test_path):
        os.remove(test_path)


if __name__ == "__main__":
    if test_api():
        test_batch_confirm()
    else:
        print("Please start the Vosk API first!")


@@ -0,0 +1,33 @@
import os
import requests
import zipfile

MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-fa-0.42.zip"
MODEL_ZIP = "vosk-model-fa-0.42.zip"
MODEL_DIR = "vosk-model-fa-0.42"

# Download the model zip if not present
if not os.path.exists(MODEL_ZIP):
    print(f"Downloading {MODEL_URL} ...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get('content-length', 0))
        with open(MODEL_ZIP, 'wb') as f:
            downloaded = 0
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    print(f"\rDownloaded {downloaded/1024/1024:.2f} MB / {total/1024/1024:.2f} MB", end='', flush=True)
    print("\nDownload complete.")
else:
    print(f"{MODEL_ZIP} already exists.")

# Extract the model zip if not already extracted
if not os.path.exists(MODEL_DIR):
    print(f"Extracting {MODEL_ZIP} ...")
    with zipfile.ZipFile(MODEL_ZIP, 'r') as zip_ref:
        zip_ref.extractall()
    print(f"Extracted to {MODEL_DIR}.")
else:
    print(f"{MODEL_DIR} already extracted.")


@@ -0,0 +1,89 @@
import sys
import os
import pandas as pd
import numpy as np
import sounddevice as sd
from PyQt5.QtWidgets import (
    QApplication, QWidget, QLabel, QPushButton, QVBoxLayout, QHBoxLayout, QMessageBox
)

parquet_path = os.path.join('confirmed_dataset', 'confirmed_shard_00.parquet')
df = pd.read_parquet(parquet_path)
results = []


class AudioReviewer(QWidget):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.idx = 0
        self.total = len(df)
        self.audio = None
        self.transcription = None
        self.setWindowTitle("Human Audio Confirmation GUI (PyQt5)")
        self.setGeometry(100, 100, 600, 200)
        self.label = QLabel(f"Sample 1/{self.total}", self)
        self.trans_label = QLabel("", self)
        self.play_button = QPushButton("Play Audio", self)
        self.yes_button = QPushButton("Yes (Correct)", self)
        self.no_button = QPushButton("No (Incorrect)", self)
        self.skip_button = QPushButton("Skip", self)
        self.quit_button = QPushButton("Quit", self)
        self.play_button.clicked.connect(self.play_audio)
        self.yes_button.clicked.connect(lambda: self.save_and_next('y'))
        self.no_button.clicked.connect(lambda: self.save_and_next('n'))
        self.skip_button.clicked.connect(lambda: self.save_and_next('skip'))
        self.quit_button.clicked.connect(self.quit)
        vbox = QVBoxLayout()
        vbox.addWidget(self.label)
        vbox.addWidget(self.trans_label)
        vbox.addWidget(self.play_button)
        hbox = QHBoxLayout()
        hbox.addWidget(self.yes_button)
        hbox.addWidget(self.no_button)
        hbox.addWidget(self.skip_button)
        hbox.addWidget(self.quit_button)
        vbox.addLayout(hbox)
        self.setLayout(vbox)
        self.load_sample()

    def load_sample(self):
        if self.idx >= self.total:
            QMessageBox.information(self, "Done", "All samples reviewed!")
            self.quit()
            return
        row = self.df.iloc[self.idx]
        # Convert bytes back to numpy array
        audio_bytes = row['audio']
        self.audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0  # Convert int16 to float32
        self.transcription = row['text']  # Use 'text' column instead of 'transcription'
        self.label.setText(f"Sample {self.idx+1}/{self.total}")
        self.trans_label.setText(f"Transcription: {self.transcription}")

    def play_audio(self):
        sd.play(self.audio, 16000)
        sd.wait()

    def save_and_next(self, result):
        results.append({
            'index': self.idx,
            'transcription': self.transcription,
            'result': result
        })
        self.idx += 1
        self.load_sample()

    def quit(self):
        pd.DataFrame(results).to_csv('human_confirmed_results.csv', index=False)
        self.close()


if __name__ == "__main__":
    app = QApplication(sys.argv)
    reviewer = AudioReviewer(df)
    reviewer.show()
    sys.exit(app.exec_())


@@ -0,0 +1,39 @@
import requests
import difflib
import sys

# Usage: python test_vosk_transcription.py <audio_file> <reference_text>
API_URL = 'http://localhost:5000/transcribe'


def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()


def main():
    if len(sys.argv) != 3:
        print("Usage: python test_vosk_transcription.py <audio_file> <reference_text>")
        sys.exit(1)
    audio_path = sys.argv[1]
    reference_text = sys.argv[2]
    with open(audio_path, 'rb') as f:
        files = {'audio': f}
        response = requests.post(API_URL, files=files)
    if response.status_code != 200:
        print(f"API error: {response.text}")
        sys.exit(1)
    transcription = response.json().get('transcription', '')
    sim = similarity(transcription, reference_text)
    print(f"Transcription: {transcription}")
    print(f"Reference: {reference_text}")
    print(f"Similarity: {sim:.2f}")
    if sim > 0.2:
        print("Test PASSED: Similarity above threshold.")
        sys.exit(0)
    else:
        print("Test FAILED: Similarity below threshold.")
        sys.exit(1)


if __name__ == '__main__':
    main()


@@ -0,0 +1,22 @@
FROM python:3.10-slim

# Install dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy service code
COPY app.py ./

# Copy model directory
COPY model/ ./model/

EXPOSE 5000

CMD ["python", "app.py"]


@@ -0,0 +1,26 @@
# Vosk Speech-to-Text Docker Service

## Setup

1. Download and extract a Vosk model (for example, `vosk-model-small-en-us-0.15.zip`) so that the model files sit directly under `model/`:
   ```sh
   unzip vosk-model-small-en-us-0.15.zip -d model
   mv model/vosk-model-small-en-us-0.15/* model/
   ```
2. Build the Docker image:
   ```sh
   docker build -t vosk-api .
   ```
3. Run the Docker container (mounting the model directory):
   ```sh
   docker run -p 5000:5000 -v $(pwd)/model:/app/model vosk-api
   ```

## API Usage

POST `/transcribe` with form-data key `audio` (WAV/FLAC/OGG file). Returns JSON with `transcription`.
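
A minimal client sketch (assuming the service is reachable at `localhost:5000`; `sample.flac` is a placeholder file name):

```python
import requests

# Send one audio file to /transcribe and print the recognized text.
# Assumes the container from this repo is running locally on port 5000.
with open("sample.flac", "rb") as f:
    response = requests.post(
        "http://localhost:5000/transcribe",
        files={"audio": f},
        timeout=60,
    )
response.raise_for_status()
print(response.json()["transcription"])
```

The service also exposes a `GET /` health check and a `POST /batch_confirm` endpoint that accepts files `audio0`, `audio1`, … together with a JSON-encoded `references` list, returning per-sample `transcription`, `similarity`, and a `confirmed` flag (see `app.py`).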

vosk/vosk_service/app.py

@@ -0,0 +1,108 @@
from flask import Flask, request, jsonify
from vosk import Model, KaldiRecognizer
import soundfile as sf
import io
import os
import json
import numpy as np
from multiprocessing import Process, Queue
import difflib

app = Flask(__name__)
MODEL_PATH = "/app/model"

# Check if model exists and load it
print(f"Checking for model at: {MODEL_PATH}")
if os.path.exists(MODEL_PATH):
    print(f"Model directory exists at {MODEL_PATH}")
    print(f"Contents: {os.listdir(MODEL_PATH)}")
    try:
        model = Model(MODEL_PATH)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise RuntimeError(f"Failed to load Vosk model: {e}")
else:
    print(f"Model directory not found at {MODEL_PATH}")
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}. Please download and mount a model.")


def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()


def confirm_voice(audio_bytes, reference_text, samplerate, queue):
    data, _ = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    text = json.loads(result).get('text', '')
    sim = similarity(text, reference_text)
    queue.put({'transcription': text, 'similarity': sim, 'confirmed': sim > 0.2})


@app.route('/', methods=['GET'])
def health_check():
    return jsonify({'status': 'ok', 'service': 'vosk-transcription-api', 'model': 'persian'})


@app.route('/batch_confirm', methods=['POST'])
def batch_confirm():
    # Expecting multipart/form-data with multiple audio files and a JSON list of references
    # audio files: audio0, audio1, ...
    # references: JSON list in 'references' field
    references = request.form.get('references')
    if not references:
        return jsonify({'error': 'Missing references'}), 400
    try:
        references = json.loads(references)
    except Exception:
        return jsonify({'error': 'Invalid references JSON'}), 400
    audio_files = []
    for i in range(len(references)):
        audio_file = request.files.get(f'audio{i}')
        if not audio_file:
            return jsonify({'error': f'Missing audio file audio{i}'}), 400
        audio_files.append(audio_file.read())
    results = []
    processes = []
    queues = []
    # Get sample rates for each audio
    samplerates = []
    for audio_bytes in audio_files:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
        samplerates.append(samplerate)
    for idx, (audio_bytes, reference_text, samplerate) in enumerate(zip(audio_files, references, samplerates)):
        queue = Queue()
        p = Process(target=confirm_voice, args=(audio_bytes, reference_text, samplerate, queue))
        processes.append(p)
        queues.append(queue)
        p.start()
    for p in processes:
        p.join()
    for queue in queues:
        results.append(queue.get())
    return jsonify({'results': results})


@app.route('/transcribe', methods=['POST'])
def transcribe():
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    audio_bytes = audio_file.read()
    data, samplerate = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # Use first channel if stereo
    # Convert to 16-bit PCM
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    print(result)  # For debugging
    text = json.loads(result).get('text', '')
    return jsonify({'transcription': text})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


@@ -0,0 +1,3 @@
vosk
Flask
soundfile