Enhance batch_confirm_hf_optimized.py to ensure torchcodec is installed before loading the dataset, and update requirements_optimized.txt to include torchcodec. Modify run_optimized_192cores_no_root.sh to install additional audio dependencies and test audio imports.

This commit is contained in:
Alireza
2025-08-02 18:33:31 +03:30
parent ab53369a89
commit 561e8b519c
8 changed files with 479 additions and 0 deletions

View File

@@ -27,6 +27,16 @@ BATCH_SIZE = 32 # Increased batch size for better throughput
# Upper bound on simultaneous requests, written as the 192 / 4 derivation
# from the original note ("for optimal concurrency").
# NOTE(review): 192 is presumably the target core count -- confirm.
MAX_CONCURRENT_REQUESTS = 192 // 4
# Data is processed in chunks of this many items to manage memory.
CHUNK_SIZE = 1000
# Ensure torchcodec is importable before the dataset is loaded.
# If the import fails, install the package with pip into the environment of
# the interpreter running this script, then retry the import.
try:
    import torchcodec
except ImportError:
    print("Installing torchcodec...")
    import importlib
    import subprocess
    import sys

    # sys.executable makes pip target this interpreter's environment;
    # check_call raises CalledProcessError if the install exits non-zero.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "torchcodec>=0.1.0"]
    )
    # The package was installed after this process started: refresh the
    # import system's finder caches so the retry below can see it.
    importlib.invalidate_caches()
    import torchcodec
# Load the dataset with audio decoding
print("Loading dataset...")
ds = load_dataset(