Enhance batch_confirm_hf_optimized.py to ensure torchcodec is installed before loading the dataset, and update requirements_optimized.txt to include torchcodec. Modify run_optimized_192cores_no_root.sh to install additional audio dependencies and test audio imports.

Alireza
2025-08-02 18:33:31 +03:30
parent ab53369a89
commit 561e8b519c
8 changed files with 479 additions and 0 deletions
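Only the Python script is reproduced below; the requirements and shell-script changes mentioned in the commit message are part of the other changed files. As a rough, hypothetical sketch of the "ensure torchcodec is installed before loading the dataset" guard the commit message describes (the helper name ensure_torchcodec and the pip invocation via subprocess are assumptions, not taken from the actual diff), the check before load_dataset might look like:

# Hypothetical sketch of the torchcodec guard described in the commit message;
# the helper name and the pip invocation are assumptions.
import importlib.util
import subprocess
import sys

def ensure_torchcodec():
    """Install torchcodec if it cannot be imported, then verify the audio imports."""
    if importlib.util.find_spec("torchcodec") is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torchcodec"])
    # Fail fast if the audio stack is still broken
    import torchcodec  # noqa: F401
    import soundfile   # noqa: F401

ensure_torchcodec()
# ...and only then call load_dataset(...) as in the script below.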


@@ -0,0 +1,318 @@
import asyncio
import aiohttp
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import soundfile as sf
import requests
import os
from tqdm import tqdm
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from huggingface_hub import HfApi, create_repo
from datasets import load_dataset, Audio, Dataset
import time
from functools import partial
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Configuration for 192 cores
NUM_CORES = 192
BATCH_SIZE = 32 # Increased batch size for better throughput
MAX_CONCURRENT_REQUESTS = 48 # 192/4 for optimal concurrency
CHUNK_SIZE = 1000  # Process data in chunks to manage memory (reserved; not referenced below)
# Load the dataset without audio decoding first
print("Loading dataset...")
ds = load_dataset(
    "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
    split="validated",
    streaming=False
)
# Now cast to audio after loading
print("Casting to audio...")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))
output_dir = "confirmed_dataset"
os.makedirs(output_dir, exist_ok=True)
API_URL = "http://localhost:5000/batch_confirm"
# Hugging Face configuration
HF_DATASET_NAME = "dpr2000/persian-cv17-confirmed"
HF_PRIVATE = True

def save_flac(audio_array, path):
    """Save audio array as FLAC file"""
    sf.write(path, audio_array, 16000, format="FLAC")


def process_audio_chunk(audio_data):
    """Process a single audio item - designed for multiprocessing"""
    audio, sentence = audio_data
    # The datasets Audio feature yields a dict; hash the raw samples for a temp file name
    flac_path = f"temp_{hash(audio['array'].tobytes())}.flac"
    save_flac(audio["array"], flac_path)
    return {
        'flac_path': flac_path,
        'sentence': sentence,
        'audio_array': audio["array"]
    }

async def send_batch_request(session, batch_data, batch_id):
    """Send a single batch request asynchronously"""
    form = aiohttp.FormData()
    references = []
    temp_flacs = []
    audio_arrays = []
    open_files = []
    for j, item in enumerate(batch_data):
        f = open(item['flac_path'], "rb")
        open_files.append(f)
        # aiohttp has no `files=` argument; multipart uploads go through FormData
        form.add_field(f"audio{j}", f,
                       filename=os.path.basename(item['flac_path']),
                       content_type="audio/flac")
        references.append(item['sentence'])
        temp_flacs.append(item['flac_path'])
        audio_arrays.append(item['audio_array'])
    form.add_field("references", json.dumps(references))

    try:
        async with session.post(API_URL, data=form,
                                timeout=aiohttp.ClientTimeout(total=120)) as response:
            if response.status == 200:
                resp_json = await response.json()
                if "results" in resp_json:
                    results = resp_json["results"]
                else:
                    logger.warning(f"Batch {batch_id} failed: 'results' key missing")
                    results = [None] * len(references)
            else:
                logger.error(f"Batch {batch_id} failed: HTTP {response.status}")
                results = [None] * len(references)
    except Exception as e:
        logger.error(f"Batch {batch_id} failed: {e}")
        results = [None] * len(references)
    finally:
        # Clean up file handles and temporary FLAC files
        for f in open_files:
            f.close()
        for flac_path in temp_flacs:
            try:
                os.remove(flac_path)
            except OSError:
                pass

    # Process results
    confirmed_items = []
    for j, result in enumerate(results):
        if result and result.get("confirmed"):
            confirmed_items.append({
                "audio": audio_arrays[j],
                "transcription": references[j]
            })
    return confirmed_items

async def process_dataset_async():
    """Main async processing function"""
    confirmed = []

    # Prepare all audio data first using multiprocessing
    print("Preparing audio data with multiprocessing...")
    audio_data = [(ds[i]["audio"], ds[i]["sentence"]) for i in range(len(ds))]

    # Use ProcessPoolExecutor for CPU-intensive audio processing
    with ProcessPoolExecutor(max_workers=NUM_CORES) as executor:
        processed_audio = list(tqdm(
            executor.map(process_audio_chunk, audio_data),
            total=len(audio_data),
            desc="Processing audio files"
        ))

    # Create batches
    batches = []
    for i in range(0, len(processed_audio), BATCH_SIZE):
        batch = processed_audio[i:i+BATCH_SIZE]
        batches.append((batch, i // BATCH_SIZE))

    print(f"Processing {len(batches)} batches with {MAX_CONCURRENT_REQUESTS} concurrent requests...")

    # Process batches asynchronously
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=MAX_CONCURRENT_REQUESTS),
        timeout=aiohttp.ClientTimeout(total=300)
    ) as session:
        tasks = []
        for batch_data, batch_id in batches:
            task = send_batch_request(session, batch_data, batch_id)
            tasks.append(task)

        # Process in chunks to avoid overwhelming the system
        chunk_size = MAX_CONCURRENT_REQUESTS
        for i in range(0, len(tasks), chunk_size):
            chunk_tasks = tasks[i:i+chunk_size]
            results = await asyncio.gather(*chunk_tasks, return_exceptions=True)
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"Task failed: {result}")
                else:
                    confirmed.extend(result)
            print(f"Processed {min(i+chunk_size, len(tasks))}/{len(tasks)} batches, confirmed: {len(confirmed)}")

    return confirmed

def save_confirmed_data_parallel(confirmed):
    """Save confirmed data using parallel processing"""
    if not confirmed:
        print("❌ No confirmed samples to save")
        return

    print(f"\n🔄 Saving {len(confirmed)} confirmed samples...")

    def extract_minimal(example):
        """Convert audio to int16 format"""
        audio_float32 = np.array(example["audio"], dtype=np.float32)
        audio_float32 = np.clip(audio_float32, -1.0, 1.0)
        audio_int16 = (audio_float32 * 32767).astype(np.int16)
        return {
            "audio": audio_int16.tobytes(),
            "text": example["transcription"]
        }

    # Create dataset from confirmed samples
    confirmed_dataset = Dataset.from_list(confirmed)
    confirmed_dataset = confirmed_dataset.map(
        extract_minimal,
        remove_columns=confirmed_dataset.column_names,
        num_proc=min(NUM_CORES, len(confirmed_dataset))  # Use all cores, capped at dataset size
    )

    # Optimize sharding for parallel writing
    num_shards = min(50, len(confirmed))  # More shards for better parallelization
    shard_size = len(confirmed_dataset) // num_shards + 1

    def write_shard(shard_info):
        """Write a single shard"""
        i, start, end = shard_info
        if start >= len(confirmed_dataset):
            return None
        shard = confirmed_dataset.select(range(start, end))
        table = pa.Table.from_pandas(shard.to_pandas())
        shard_path = os.path.join(output_dir, f"confirmed_shard_{i:03}.parquet")
        pq.write_table(
            table,
            shard_path,
            compression="zstd",
            compression_level=22,
            use_dictionary=True,
            version="2.6"
        )
        return f"Shard {i+1}: {len(shard)} samples saved to {shard_path}"

    # Prepare shard information
    shard_info = []
    for i in range(num_shards):
        start = i * shard_size
        end = min(len(confirmed_dataset), (i + 1) * shard_size)
        shard_info.append((i, start, end))

    # Write shards in parallel. ThreadPoolExecutor is used here because the nested
    # write_shard closure cannot be pickled for a ProcessPoolExecutor under the
    # 'spawn' start method; pyarrow's parquet writer releases the GIL, so threads
    # still parallelize the writes.
    print(f"Writing {num_shards} shards in parallel...")
    with ThreadPoolExecutor(max_workers=min(NUM_CORES, num_shards)) as executor:
        results = list(tqdm(
            executor.map(write_shard, shard_info),
            total=len(shard_info),
            desc="Writing shards"
        ))

    # Print results
    for result in results:
        if result:
            print(f"🔹 {result}")

    print(f"\n✅ All confirmed data saved in {num_shards} shards in `{output_dir}/`")
    return num_shards

async def upload_to_hf(confirmed, num_shards):
    """Upload to Hugging Face Hub"""
    print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))

        # Create repository
        try:
            create_repo(
                repo_id=HF_DATASET_NAME,
                repo_type="dataset",
                private=HF_PRIVATE,
                exist_ok=True,
                token=os.getenv("HF_TOKEN")
            )
            print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
        except Exception as e:
            print(f"⚠️ Repository creation failed: {e}")
            return

        # Create dataset info
        dataset_info = {
            "dataset_name": HF_DATASET_NAME,
            "description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
            "total_samples": len(confirmed),
            "num_shards": num_shards,
            "audio_format": "int16 PCM, 16kHz",
            "columns": ["audio", "text"],
            "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
            "processing": "Vosk API batch confirmation (optimized for 192 cores)"
        }
        info_path = os.path.join(output_dir, "dataset_info.json")
        with open(info_path, 'w', encoding='utf-8') as f:
            json.dump(dataset_info, f, indent=2, ensure_ascii=False)

        # Upload folder
        api.upload_folder(
            folder_path=output_dir,
            repo_id=HF_DATASET_NAME,
            repo_type="dataset",
        )
        print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
    except Exception as e:
        print(f"❌ Failed to push to Hugging Face: {e}")

async def main():
    """Main function"""
    start_time = time.time()
    print(f"🚀 Starting optimized processing with {NUM_CORES} cores")
    print(f"📊 Dataset size: {len(ds)} samples")
    print(f"⚙️ Batch size: {BATCH_SIZE}")
    print(f"🔄 Max concurrent requests: {MAX_CONCURRENT_REQUESTS}")

    # Process dataset
    confirmed = await process_dataset_async()

    # Save data
    num_shards = save_confirmed_data_parallel(confirmed)

    # Upload to HF only if something was saved
    if num_shards:
        await upload_to_hf(confirmed, num_shards)

    end_time = time.time()
    print(f"\n⏱️ Total processing time: {end_time - start_time:.2f} seconds")
    print(f"📈 Processing rate: {len(ds) / (end_time - start_time):.2f} samples/second")


if __name__ == "__main__":
    # 'spawn' avoids fork-safety issues with the asyncio event loop and thread pools
    mp.set_start_method('spawn', force=True)

    # Run the async main function
    asyncio.run(main())