Alireza
2025-07-31 17:35:08 +03:30
commit 640363fef2
27 changed files with 4201 additions and 0 deletions


@@ -0,0 +1,190 @@
from datasets import load_dataset, Audio, Dataset
import soundfile as sf
import requests
import os
from tqdm import tqdm
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from huggingface_hub import HfApi, create_repo

# Load the dataset with audio decoding
print("Loading dataset...")
ds = load_dataset(
    "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
    split="validated[:500]",
    streaming=False
).cast_column("audio", Audio(sampling_rate=16000))

output_dir = "confirmed_dataset"
os.makedirs(output_dir, exist_ok=True)

confirmed = []
API_URL = "http://localhost:5000/batch_confirm"
batch_size = 8

# Hugging Face configuration
HF_DATASET_NAME = "dpr2000/persian-cv17-confirmed"  # Change this to your desired dataset name
HF_PRIVATE = True  # Set to True for private dataset


def save_flac(audio_array, path):
    sf.write(path, audio_array, 16000, format="FLAC")


print("Processing batches...")
for i in tqdm(range(0, len(ds), batch_size)):
    batch = ds[i:i+batch_size]
    files = {}
    references = []
    temp_flacs = []
    audio_arrays = []
    # Fix: batch is a dict of lists
    for j in range(len(batch["audio"])):
        audio = batch["audio"][j]
        flac_path = f"temp_{i+j}.flac"
        save_flac(audio["array"], flac_path)
        files[f"audio{j}"] = open(flac_path, "rb")
        references.append(batch["sentence"][j])
        temp_flacs.append(flac_path)
        audio_arrays.append(audio["array"])  # Store the array for confirmed
    data = {"references": json.dumps(references)}
    try:
        response = requests.post(API_URL, files=files, data=data, timeout=120)
        if response.status_code == 200:
            resp_json = response.json()
            if "results" in resp_json:
                results = resp_json["results"]
            else:
                print(f"Batch {i} failed: 'results' key missing in response: {resp_json}")
                results = [None] * len(references)
        else:
            print(f"Batch {i} failed: HTTP {response.status_code} - {response.text}")
            results = [None] * len(references)
    except Exception as e:
        print(f"Batch {i} failed: {e}")
        results = [None] * len(references)
    for j, result in enumerate(results):
        if result and result.get("confirmed"):
            # Save confirmed audio array and transcription
            confirmed.append({"audio": audio_arrays[j], "transcription": references[j]})
            os.remove(temp_flacs[j])
        else:
            os.remove(temp_flacs[j])
    for f in files.values():
        f.close()

# Save confirmed data using sharding approach
if confirmed:
    print(f"\n🔄 Saving {len(confirmed)} confirmed samples...")

    # Convert confirmed data to HuggingFace dataset format
    def extract_minimal(example):
        # Convert float32 audio (range -1.0 to 1.0) to int16 (range -32768 to 32767)
        audio_float32 = np.array(example["audio"], dtype=np.float32)
        # Ensure audio is in valid range and scale to int16
        audio_float32 = np.clip(audio_float32, -1.0, 1.0)
        audio_int16 = (audio_float32 * 32767).astype(np.int16)
        return {
            "audio": audio_int16.tobytes(),  # Store as int16 bytes, compatible with Whisper
            "text": example["transcription"]
        }

    # Create dataset from confirmed samples
    confirmed_dataset = Dataset.from_list(confirmed)
    confirmed_dataset = confirmed_dataset.map(extract_minimal, remove_columns=confirmed_dataset.column_names)

    # Sharding parameters
    num_shards = min(1, len(confirmed))  # Don't create more shards than samples
    shard_size = len(confirmed_dataset) // num_shards + 1

    # Write each shard separately
    for i in range(num_shards):
        start = i * shard_size
        end = min(len(confirmed_dataset), (i + 1) * shard_size)
        if start >= len(confirmed_dataset):
            break
        shard = confirmed_dataset.select(range(start, end))
        table = pa.Table.from_pandas(shard.to_pandas())  # Convert to PyArrow table
        shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
        pq.write_table(
            table,
            shard_path,
            compression="zstd",
            compression_level=22,  # Maximum compression
            use_dictionary=True,
            version="2.6"
        )
        print(f"🔹 Shard {i+1}/{num_shards}: {len(shard)} samples saved")

    print(f"\n✅ All confirmed data saved in {num_shards} shards in `{output_dir}/`")

    # Push to Hugging Face Hub
    print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
    try:
        # Initialize HF API
        api = HfApi()

        # Create the repository (private if specified)
        try:
            create_repo(
                repo_id=HF_DATASET_NAME,
                repo_type="dataset",
                private=HF_PRIVATE,
                exist_ok=True
            )
            print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
        except Exception as e:
            print(f"⚠️ Repository creation: {e}")

        # Upload all parquet files
        for i in range(num_shards):
            shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
            if os.path.exists(shard_path):
                api.upload_file(
                    path_or_fileobj=shard_path,
                    path_in_repo=f"confirmed_shard_{i:02}.parquet",
                    repo_id=HF_DATASET_NAME,
                    repo_type="dataset"
                )
                print(f"📤 Uploaded shard {i+1}/{num_shards}")

        # Create dataset info file
        dataset_info = {
            "dataset_name": HF_DATASET_NAME,
            "description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
            "total_samples": len(confirmed),
            "num_shards": num_shards,
            "audio_format": "int16 PCM, 16kHz",
            "columns": ["audio", "text"],
            "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
            "processing": "Vosk API batch confirmation"
        }

        # Upload dataset info
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(dataset_info, f, indent=2, ensure_ascii=False)
            info_path = f.name
        api.upload_file(
            path_or_fileobj=info_path,
            path_in_repo="dataset_info.json",
            repo_id=HF_DATASET_NAME,
            repo_type="dataset"
        )
        os.unlink(info_path)

        print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")
    except Exception as e:
        print(f"❌ Failed to push to Hugging Face: {e}")
        print("💡 Make sure you're logged in with: huggingface-cli login")
else:
    print("❌ No confirmed samples to save")


@@ -0,0 +1,52 @@
import requests
import json
import soundfile as sf
import numpy as np
import os

# Test the API connection
API_URL = "http://localhost:5000/batch_confirm"


def test_api():
    print("Testing API connection...")
    try:
        response = requests.get("http://localhost:5000/")
        print(f"API health check: {response.status_code}")
        print(f"Response: {response.json()}")
    except Exception as e:
        print(f"API not reachable: {e}")
        return False
    return True


def test_batch_confirm():
    print("\nTesting batch confirm...")
    # Create a simple test audio file
    test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
    test_path = "test_audio.flac"
    sf.write(test_path, test_audio, 16000, format="FLAC")
    # Test batch confirm
    with open(test_path, "rb") as f:
        files = {"audio0": f}
        data = {"references": json.dumps(["test sentence"])}
        try:
            response = requests.post(API_URL, files=files, data=data, timeout=30)
            print(f"Batch confirm response: {response.status_code}")
            if response.status_code == 200:
                print(f"Response JSON: {response.json()}")
            else:
                print(f"Error: {response.text}")
        except Exception as e:
            print(f"Batch confirm failed: {e}")
    # Clean up
    if os.path.exists(test_path):
        os.remove(test_path)


if __name__ == "__main__":
    if test_api():
        test_batch_confirm()
    else:
        print("Please start the Vosk API first!")


@@ -0,0 +1,33 @@
import os
import requests
import zipfile

MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-fa-0.42.zip"
MODEL_ZIP = "vosk-model-fa-0.42.zip"
MODEL_DIR = "vosk-model-fa-0.42"

# Download the model zip if not present
if not os.path.exists(MODEL_ZIP):
    print(f"Downloading {MODEL_URL} ...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get('content-length', 0))
        with open(MODEL_ZIP, 'wb') as f:
            downloaded = 0
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    print(f"\rDownloaded {downloaded/1024/1024:.2f} MB / {total/1024/1024:.2f} MB", end='', flush=True)
    print("\nDownload complete.")
else:
    print(f"{MODEL_ZIP} already exists.")

# Extract the model zip if not already extracted
if not os.path.exists(MODEL_DIR):
    print(f"Extracting {MODEL_ZIP} ...")
    with zipfile.ZipFile(MODEL_ZIP, 'r') as zip_ref:
        zip_ref.extractall()
    print(f"Extracted to {MODEL_DIR}.")
else:
    print(f"{MODEL_DIR} already extracted.")


@@ -0,0 +1,89 @@
import sys
import os
import pandas as pd
import numpy as np
import sounddevice as sd
from PyQt5.QtWidgets import (
    QApplication, QWidget, QLabel, QPushButton, QVBoxLayout, QHBoxLayout, QMessageBox
)

parquet_path = os.path.join('confirmed_dataset', 'confirmed_shard_00.parquet')
df = pd.read_parquet(parquet_path)
results = []


class AudioReviewer(QWidget):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.idx = 0
        self.total = len(df)
        self.audio = None
        self.transcription = None
        self.setWindowTitle("Human Audio Confirmation GUI (PyQt5)")
        self.setGeometry(100, 100, 600, 200)
        self.label = QLabel(f"Sample 1/{self.total}", self)
        self.trans_label = QLabel("", self)
        self.play_button = QPushButton("Play Audio", self)
        self.yes_button = QPushButton("Yes (Correct)", self)
        self.no_button = QPushButton("No (Incorrect)", self)
        self.skip_button = QPushButton("Skip", self)
        self.quit_button = QPushButton("Quit", self)
        self.play_button.clicked.connect(self.play_audio)
        self.yes_button.clicked.connect(lambda: self.save_and_next('y'))
        self.no_button.clicked.connect(lambda: self.save_and_next('n'))
        self.skip_button.clicked.connect(lambda: self.save_and_next('skip'))
        self.quit_button.clicked.connect(self.quit)
        vbox = QVBoxLayout()
        vbox.addWidget(self.label)
        vbox.addWidget(self.trans_label)
        vbox.addWidget(self.play_button)
        hbox = QHBoxLayout()
        hbox.addWidget(self.yes_button)
        hbox.addWidget(self.no_button)
        hbox.addWidget(self.skip_button)
        hbox.addWidget(self.quit_button)
        vbox.addLayout(hbox)
        self.setLayout(vbox)
        self.load_sample()

    def load_sample(self):
        if self.idx >= self.total:
            QMessageBox.information(self, "Done", "All samples reviewed!")
            self.quit()
            return
        row = self.df.iloc[self.idx]
        # Convert bytes back to numpy array
        audio_bytes = row['audio']
        self.audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0  # Convert int16 to float32
        self.transcription = row['text']  # Use 'text' column instead of 'transcription'
        self.label.setText(f"Sample {self.idx+1}/{self.total}")
        self.trans_label.setText(f"Transcription: {self.transcription}")

    def play_audio(self):
        sd.play(self.audio, 16000)
        sd.wait()

    def save_and_next(self, result):
        results.append({
            'index': self.idx,
            'transcription': self.transcription,
            'result': result
        })
        self.idx += 1
        self.load_sample()

    def quit(self):
        pd.DataFrame(results).to_csv('human_confirmed_results.csv', index=False)
        self.close()


if __name__ == "__main__":
    app = QApplication(sys.argv)
    reviewer = AudioReviewer(df)
    reviewer.show()
    sys.exit(app.exec_())


@@ -0,0 +1,39 @@
import requests
import difflib
import sys

# Usage: python test_vosk_transcription.py <audio_file> <reference_text>
API_URL = 'http://localhost:5000/transcribe'


def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()


def main():
    if len(sys.argv) != 3:
        print("Usage: python test_vosk_transcription.py <audio_file> <reference_text>")
        sys.exit(1)
    audio_path = sys.argv[1]
    reference_text = sys.argv[2]
    with open(audio_path, 'rb') as f:
        files = {'audio': f}
        response = requests.post(API_URL, files=files)
    if response.status_code != 200:
        print(f"API error: {response.text}")
        sys.exit(1)
    transcription = response.json().get('transcription', '')
    sim = similarity(transcription, reference_text)
    print(f"Transcription: {transcription}")
    print(f"Reference: {reference_text}")
    print(f"Similarity: {sim:.2f}")
    if sim > 0.2:
        print("Test PASSED: Similarity above threshold.")
        sys.exit(0)
    else:
        print("Test FAILED: Similarity below threshold.")
        sys.exit(1)


if __name__ == '__main__':
    main()


@@ -0,0 +1,22 @@
FROM python:3.10-slim

# Install dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy service code
COPY app.py ./

# Copy model directory
COPY model/ ./model/

EXPOSE 5000

CMD ["python", "app.py"]


@@ -0,0 +1,26 @@
# Vosk Speech-to-Text Docker Service

## Setup

1. Download and extract a Vosk model (for example, `vosk-model-small-en-us-0.15.zip`) so that the model files sit directly under `model/`:
   ```sh
   unzip vosk-model-small-en-us-0.15.zip -d model
   mv model/vosk-model-small-en-us-0.15/* model/
   ```
2. Build the Docker image:
   ```sh
   docker build -t vosk-api .
   ```
3. Run the Docker container (mounting the model directory):
   ```sh
   docker run -p 5000:5000 -v $(pwd)/model:/app/model vosk-api
   ```

## API Usage

POST `/transcribe` with form-data key `audio` (WAV/FLAC/OGG file). Returns JSON with `transcription`.
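
A minimal client sketch (assuming the service is reachable at `localhost:5000`; `sample.flac` is a placeholder file name):

```python
import requests

# Send one audio file to /transcribe and print the recognized text.
# Assumes the container from this repo is running locally on port 5000.
with open("sample.flac", "rb") as f:
    response = requests.post(
        "http://localhost:5000/transcribe",
        files={"audio": f},
        timeout=60,
    )
response.raise_for_status()
print(response.json()["transcription"])
```

The service also exposes a `GET /` health check and a `POST /batch_confirm` endpoint that accepts files `audio0`, `audio1`, … together with a JSON-encoded `references` list, returning per-sample `transcription`, `similarity`, and a `confirmed` flag (see `app.py`).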

vosk/vosk_service/app.py

@@ -0,0 +1,108 @@
from flask import Flask, request, jsonify
from vosk import Model, KaldiRecognizer
import soundfile as sf
import io
import os
import json
import numpy as np
from multiprocessing import Process, Queue
import difflib

app = Flask(__name__)
MODEL_PATH = "/app/model"

# Check if model exists and load it
print(f"Checking for model at: {MODEL_PATH}")
if os.path.exists(MODEL_PATH):
    print(f"Model directory exists at {MODEL_PATH}")
    print(f"Contents: {os.listdir(MODEL_PATH)}")
    try:
        model = Model(MODEL_PATH)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise RuntimeError(f"Failed to load Vosk model: {e}")
else:
    print(f"Model directory not found at {MODEL_PATH}")
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}. Please download and mount a model.")


def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()


def confirm_voice(audio_bytes, reference_text, samplerate, queue):
    data, _ = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    text = json.loads(result).get('text', '')
    sim = similarity(text, reference_text)
    queue.put({'transcription': text, 'similarity': sim, 'confirmed': sim > 0.2})


@app.route('/', methods=['GET'])
def health_check():
    return jsonify({'status': 'ok', 'service': 'vosk-transcription-api', 'model': 'persian'})


@app.route('/batch_confirm', methods=['POST'])
def batch_confirm():
    # Expecting multipart/form-data with multiple audio files and a JSON list of references
    # audio files: audio0, audio1, ...
    # references: JSON list in 'references' field
    references = request.form.get('references')
    if not references:
        return jsonify({'error': 'Missing references'}), 400
    try:
        references = json.loads(references)
    except Exception:
        return jsonify({'error': 'Invalid references JSON'}), 400
    audio_files = []
    for i in range(len(references)):
        audio_file = request.files.get(f'audio{i}')
        if not audio_file:
            return jsonify({'error': f'Missing audio file audio{i}'}), 400
        audio_files.append(audio_file.read())
    results = []
    processes = []
    queues = []
    # Get sample rates for each audio
    samplerates = []
    for audio_bytes in audio_files:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
        samplerates.append(samplerate)
    for idx, (audio_bytes, reference_text, samplerate) in enumerate(zip(audio_files, references, samplerates)):
        queue = Queue()
        p = Process(target=confirm_voice, args=(audio_bytes, reference_text, samplerate, queue))
        processes.append(p)
        queues.append(queue)
        p.start()
    for p in processes:
        p.join()
    for queue in queues:
        results.append(queue.get())
    return jsonify({'results': results})


@app.route('/transcribe', methods=['POST'])
def transcribe():
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    audio_bytes = audio_file.read()
    data, samplerate = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # Use first channel if stereo
    # Convert to 16-bit PCM
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    print(result)  # For debugging
    text = json.loads(result).get('text', '')
    return jsonify({'transcription': text})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


@@ -0,0 +1,3 @@
vosk
Flask
soundfile