vosk/test_files/batch_confirm_hf.py (new file)
from datasets import load_dataset, Audio, Dataset
import soundfile as sf
import requests
import os
from tqdm import tqdm
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from huggingface_hub import HfApi, create_repo

# Load the dataset with audio decoding
print("Loading dataset...")
ds = load_dataset(
    "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
    split="validated[:500]",
    streaming=False
).cast_column("audio", Audio(sampling_rate=16000))

output_dir = "confirmed_dataset"
os.makedirs(output_dir, exist_ok=True)
confirmed = []

API_URL = "http://localhost:5000/batch_confirm"
batch_size = 8

# Hugging Face configuration
HF_DATASET_NAME = "dpr2000/persian-cv17-confirmed"  # Change this to your desired dataset name
HF_PRIVATE = True  # Set to True for a private dataset

def save_flac(audio_array, path):
    sf.write(path, audio_array, 16000, format="FLAC")

print("Processing batches...")
for i in tqdm(range(0, len(ds), batch_size)):
    batch = ds[i:i+batch_size]  # Slicing a Dataset returns a dict of lists
    files = {}
    references = []
    temp_flacs = []
    audio_arrays = []
    for j in range(len(batch["audio"])):
        audio = batch["audio"][j]
        flac_path = f"temp_{i+j}.flac"
        save_flac(audio["array"], flac_path)
        files[f"audio{j}"] = open(flac_path, "rb")
        references.append(batch["sentence"][j])
        temp_flacs.append(flac_path)
        audio_arrays.append(audio["array"])  # Keep the array for confirmed samples
    data = {"references": json.dumps(references)}
    try:
        response = requests.post(API_URL, files=files, data=data, timeout=120)
        if response.status_code == 200:
            resp_json = response.json()
            if "results" in resp_json:
                results = resp_json["results"]
            else:
                print(f"Batch {i} failed: 'results' key missing in response: {resp_json}")
                results = [None] * len(references)
        else:
            print(f"Batch {i} failed: HTTP {response.status_code} - {response.text}")
            results = [None] * len(references)
    except Exception as e:
        print(f"Batch {i} failed: {e}")
        results = [None] * len(references)
    for j, result in enumerate(results):
        if result and result.get("confirmed"):
            # Keep the confirmed audio array and its reference transcription
            confirmed.append({"audio": audio_arrays[j], "transcription": references[j]})
    # Close the uploaded file handles before deleting the temp files
    for f in files.values():
        f.close()
    for path in temp_flacs:
        os.remove(path)

# Save confirmed data using a sharding approach
if confirmed:
    print(f"\n🔄 Saving {len(confirmed)} confirmed samples...")

    # Convert confirmed data to the minimal storage format
    def extract_minimal(example):
        # Convert float32 audio (range -1.0 to 1.0) to int16 (range -32768 to 32767)
        audio_float32 = np.array(example["audio"], dtype=np.float32)
        # Clip to the valid range and scale to int16
        audio_float32 = np.clip(audio_float32, -1.0, 1.0)
        audio_int16 = (audio_float32 * 32767).astype(np.int16)
        return {
            "audio": audio_int16.tobytes(),  # Store as int16 bytes, compatible with Whisper
            "text": example["transcription"]
        }

    # Create a dataset from the confirmed samples
    confirmed_dataset = Dataset.from_list(confirmed)
    confirmed_dataset = confirmed_dataset.map(extract_minimal, remove_columns=confirmed_dataset.column_names)

    # Sharding parameters: a single shard for now; raise the first argument
    # of min() to write more shards (never more shards than samples)
    num_shards = min(1, len(confirmed))
    shard_size = len(confirmed_dataset) // num_shards + 1

    # Write each shard separately
    for i in range(num_shards):
        start = i * shard_size
        end = min(len(confirmed_dataset), (i + 1) * shard_size)

        if start >= len(confirmed_dataset):
            break

        shard = confirmed_dataset.select(range(start, end))
        table = pa.Table.from_pandas(shard.to_pandas())  # Convert to a PyArrow table

        shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")

        pq.write_table(
            table,
            shard_path,
            compression="zstd",
            compression_level=22,  # Maximum compression
            use_dictionary=True,
            version="2.6"
        )

        print(f"🔹 Shard {i+1}/{num_shards}: {len(shard)} samples saved")

    print(f"\n✅ All confirmed data saved in {num_shards} shards in `{output_dir}/`")

    # Push to Hugging Face Hub
    print(f"\n🚀 Pushing dataset to Hugging Face Hub as '{HF_DATASET_NAME}'...")
    try:
        # Initialize the HF API
        api = HfApi()

        # Create the repository (private if specified)
        try:
            create_repo(
                repo_id=HF_DATASET_NAME,
                repo_type="dataset",
                private=HF_PRIVATE,
                exist_ok=True
            )
            print(f"✅ Repository '{HF_DATASET_NAME}' created/verified")
        except Exception as e:
            print(f"⚠️ Repository creation: {e}")

        # Upload all parquet files
        for i in range(num_shards):
            shard_path = os.path.join(output_dir, f"confirmed_shard_{i:02}.parquet")
            if os.path.exists(shard_path):
                api.upload_file(
                    path_or_fileobj=shard_path,
                    path_in_repo=f"confirmed_shard_{i:02}.parquet",
                    repo_id=HF_DATASET_NAME,
                    repo_type="dataset"
                )
                print(f"📤 Uploaded shard {i+1}/{num_shards}")

        # Create the dataset info file
        dataset_info = {
            "dataset_name": HF_DATASET_NAME,
            "description": "Persian Common Voice confirmed samples for Whisper fine-tuning",
            "total_samples": len(confirmed),
            "num_shards": num_shards,
            "audio_format": "int16 PCM, 16kHz",
            "columns": ["audio", "text"],
            "source_dataset": "Ashegh-Sad-Warrior/Persian_Common_Voice_17_0",
            "processing": "Vosk API batch confirmation"
        }

        # Upload the dataset info
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(dataset_info, f, indent=2, ensure_ascii=False)
            info_path = f.name

        api.upload_file(
            path_or_fileobj=info_path,
            path_in_repo="dataset_info.json",
            repo_id=HF_DATASET_NAME,
            repo_type="dataset"
        )
        os.unlink(info_path)

        print(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{HF_DATASET_NAME}")

    except Exception as e:
        print(f"❌ Failed to push to Hugging Face: {e}")
        print("💡 Make sure you're logged in with: huggingface-cli login")

else:
    print("❌ No confirmed samples to save")
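Note that the shards written above store the audio as raw int16 bytes rather than a decoded audio column, so any consumer has to convert it back. A minimal sketch of reading one shard produced by this script (the decoding mirrors what `human_confirm_parquet.py` below does; the shard path is the one written above):

```python
import numpy as np
import pandas as pd

# Load one shard and decode its first sample.
df = pd.read_parquet("confirmed_dataset/confirmed_shard_00.parquet")
row = df.iloc[0]
# 'audio' holds raw 16 kHz int16 PCM bytes; scale back to float32 in [-1.0, 1.0].
audio = np.frombuffer(row["audio"], dtype=np.int16).astype(np.float32) / 32767.0
print(f"{len(df)} samples; first clip: {len(audio)/16000:.2f}s, text: {row['text']}")
```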
vosk/test_files/debug_batch_confirm.py (new file)
import requests
import json
import soundfile as sf
import numpy as np
import os

# Test the API connection
API_URL = "http://localhost:5000/batch_confirm"

def test_api():
    print("Testing API connection...")
    try:
        response = requests.get("http://localhost:5000/")
        print(f"API health check: {response.status_code}")
        print(f"Response: {response.json()}")
    except Exception as e:
        print(f"API not reachable: {e}")
        return False
    return True

def test_batch_confirm():
    print("\nTesting batch confirm...")

    # Create a simple test audio file
    test_audio = np.random.randn(16000).astype(np.float32)  # 1 second of noise
    test_path = "test_audio.flac"
    sf.write(test_path, test_audio, 16000, format="FLAC")

    # Test batch confirm
    with open(test_path, "rb") as f:
        files = {"audio0": f}
        data = {"references": json.dumps(["test sentence"])}

        try:
            response = requests.post(API_URL, files=files, data=data, timeout=30)
            print(f"Batch confirm response: {response.status_code}")
            if response.status_code == 200:
                print(f"Response JSON: {response.json()}")
            else:
                print(f"Error: {response.text}")
        except Exception as e:
            print(f"Batch confirm failed: {e}")

    # Clean up
    if os.path.exists(test_path):
        os.remove(test_path)

if __name__ == "__main__":
    if test_api():
        test_batch_confirm()
    else:
        print("Please start the Vosk API first!")
vosk/test_files/download_large_persian_model.py (new file)
import os
import requests
import zipfile

MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-fa-0.42.zip"
MODEL_ZIP = "vosk-model-fa-0.42.zip"
MODEL_DIR = "vosk-model-fa-0.42"

# Download the model zip if not present
if not os.path.exists(MODEL_ZIP):
    print(f"Downloading {MODEL_URL} ...")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get('content-length', 0))
        with open(MODEL_ZIP, 'wb') as f:
            downloaded = 0
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    print(f"\rDownloaded {downloaded/1024/1024:.2f} MB / {total/1024/1024:.2f} MB", end='', flush=True)
    print("\nDownload complete.")
else:
    print(f"{MODEL_ZIP} already exists.")

# Extract the model zip if not already extracted
if not os.path.exists(MODEL_DIR):
    print(f"Extracting {MODEL_ZIP} ...")
    with zipfile.ZipFile(MODEL_ZIP, 'r') as zip_ref:
        zip_ref.extractall()
    print(f"Extracted to {MODEL_DIR}.")
else:
    print(f"{MODEL_DIR} already extracted.")
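As a quick sanity check after extraction, the model directory can be loaded directly with the `vosk` package (a sketch, assuming `vosk` is installed in the local environment; `Model()` raises if the directory is not a valid Vosk model):

```python
from vosk import Model

# Loading the extracted directory verifies the download and unzip succeeded.
model = Model("vosk-model-fa-0.42")
print("Model loaded OK")
```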
vosk/test_files/human_confirm_parquet.py (new file)
import sys
import os
import pandas as pd
import numpy as np
import sounddevice as sd
from PyQt5.QtWidgets import (
    QApplication, QWidget, QLabel, QPushButton, QVBoxLayout, QHBoxLayout, QMessageBox
)

parquet_path = os.path.join('confirmed_dataset', 'confirmed_shard_00.parquet')
df = pd.read_parquet(parquet_path)
results = []

class AudioReviewer(QWidget):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.idx = 0
        self.total = len(df)
        self.audio = None
        self.transcription = None

        self.setWindowTitle("Human Audio Confirmation GUI (PyQt5)")
        self.setGeometry(100, 100, 600, 200)

        self.label = QLabel(f"Sample 1/{self.total}", self)
        self.trans_label = QLabel("", self)
        self.play_button = QPushButton("Play Audio", self)
        self.yes_button = QPushButton("Yes (Correct)", self)
        self.no_button = QPushButton("No (Incorrect)", self)
        self.skip_button = QPushButton("Skip", self)
        self.quit_button = QPushButton("Quit", self)

        self.play_button.clicked.connect(self.play_audio)
        self.yes_button.clicked.connect(lambda: self.save_and_next('y'))
        self.no_button.clicked.connect(lambda: self.save_and_next('n'))
        self.skip_button.clicked.connect(lambda: self.save_and_next('skip'))
        self.quit_button.clicked.connect(self.quit)

        vbox = QVBoxLayout()
        vbox.addWidget(self.label)
        vbox.addWidget(self.trans_label)
        vbox.addWidget(self.play_button)

        hbox = QHBoxLayout()
        hbox.addWidget(self.yes_button)
        hbox.addWidget(self.no_button)
        hbox.addWidget(self.skip_button)
        hbox.addWidget(self.quit_button)
        vbox.addLayout(hbox)

        self.setLayout(vbox)
        self.load_sample()

    def load_sample(self):
        if self.idx >= self.total:
            QMessageBox.information(self, "Done", "All samples reviewed!")
            self.quit()
            return
        row = self.df.iloc[self.idx]
        # Convert the stored int16 bytes back to a float32 numpy array
        audio_bytes = row['audio']
        self.audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        self.transcription = row['text']  # The shards use a 'text' column, not 'transcription'
        self.label.setText(f"Sample {self.idx+1}/{self.total}")
        self.trans_label.setText(f"Transcription: {self.transcription}")

    def play_audio(self):
        sd.play(self.audio, 16000)
        sd.wait()

    def save_and_next(self, result):
        results.append({
            'index': self.idx,
            'transcription': self.transcription,
            'result': result
        })
        self.idx += 1
        self.load_sample()

    def quit(self):
        pd.DataFrame(results).to_csv('human_confirmed_results.csv', index=False)
        self.close()

if __name__ == "__main__":
    app = QApplication(sys.argv)
    reviewer = AudioReviewer(df)
    reviewer.show()
    sys.exit(app.exec_())
vosk/test_files/test_vosk_transcription.py (new file)
import requests
import difflib
import sys

# Usage: python test_vosk_transcription.py <audio_file> <reference_text>

API_URL = 'http://localhost:5000/transcribe'


def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()


def main():
    if len(sys.argv) != 3:
        print("Usage: python test_vosk_transcription.py <audio_file> <reference_text>")
        sys.exit(1)
    audio_path = sys.argv[1]
    reference_text = sys.argv[2]
    with open(audio_path, 'rb') as f:
        files = {'audio': f}
        response = requests.post(API_URL, files=files)
    if response.status_code != 200:
        print(f"API error: {response.text}")
        sys.exit(1)
    transcription = response.json().get('transcription', '')
    sim = similarity(transcription, reference_text)
    print(f"Transcription: {transcription}")
    print(f"Reference: {reference_text}")
    print(f"Similarity: {sim:.2f}")
    if sim > 0.2:
        print("Test PASSED: Similarity above threshold.")
        sys.exit(0)
    else:
        print("Test FAILED: Similarity below threshold.")
        sys.exit(1)


if __name__ == '__main__':
    main()
vosk/vosk_service/Dockerfile (new file)
FROM python:3.10-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy service code
COPY app.py ./

# Copy model directory
COPY model/ ./model/

EXPOSE 5000

CMD ["python", "app.py"]
vosk/vosk_service/README.md (new file)
# Vosk Speech-to-Text Docker Service

## Setup

1. Download a Vosk model (for example `vosk-model-small-en-us-0.15.zip`) and extract it into a flat `model/` directory:

```sh
unzip vosk-model-small-en-us-0.15.zip -d model
mv model/vosk-model-small-en-us-0.15/* model/
```

2. Build the Docker image:

```sh
docker build -t vosk-api .
```

3. Run the Docker container (mounting the model directory):

```sh
docker run -p 5000:5000 -v $(pwd)/model:/app/model vosk-api
```

## API Usage

POST `/transcribe` with form-data key `audio` (WAV/FLAC/OGG file). Returns JSON with `transcription`.
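For reference, a minimal Python client for the `/transcribe` endpoint might look like the sketch below (assuming the service is running locally on port 5000; `sample.flac` is a placeholder file name):

```python
import requests

# Send one audio file to the /transcribe endpoint and print the result.
with open("sample.flac", "rb") as f:
    response = requests.post("http://localhost:5000/transcribe", files={"audio": f})
response.raise_for_status()
print(response.json()["transcription"])
```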
vosk/vosk_service/app.py (new file)
from flask import Flask, request, jsonify
from vosk import Model, KaldiRecognizer
import soundfile as sf
import io
import os
import json
import numpy as np
from multiprocessing import Process, Queue
import difflib

app = Flask(__name__)

MODEL_PATH = "/app/model"

# Check that the model exists and load it
print(f"Checking for model at: {MODEL_PATH}")
if os.path.exists(MODEL_PATH):
    print(f"Model directory exists at {MODEL_PATH}")
    print(f"Contents: {os.listdir(MODEL_PATH)}")
    try:
        model = Model(MODEL_PATH)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise RuntimeError(f"Failed to load Vosk model: {e}")
else:
    print(f"Model directory not found at {MODEL_PATH}")
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}. Please download and mount a model.")

def similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()

def confirm_voice(audio_bytes, reference_text, samplerate, queue):
    data, _ = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # Use the first channel if stereo
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    text = json.loads(result).get('text', '')
    sim = similarity(text, reference_text)
    queue.put({'transcription': text, 'similarity': sim, 'confirmed': sim > 0.2})

@app.route('/', methods=['GET'])
def health_check():
    return jsonify({'status': 'ok', 'service': 'vosk-transcription-api', 'model': 'persian'})

@app.route('/batch_confirm', methods=['POST'])
def batch_confirm():
    # Expects multipart/form-data with multiple audio files and a JSON list of references:
    #   audio files: audio0, audio1, ...
    #   references: JSON list in the 'references' field
    references = request.form.get('references')
    if not references:
        return jsonify({'error': 'Missing references'}), 400
    try:
        references = json.loads(references)
    except Exception:
        return jsonify({'error': 'Invalid references JSON'}), 400
    audio_files = []
    for i in range(len(references)):
        audio_file = request.files.get(f'audio{i}')
        if not audio_file:
            return jsonify({'error': f'Missing audio file audio{i}'}), 400
        audio_files.append(audio_file.read())
    results = []
    processes = []
    queues = []
    # Get the sample rate of each audio file
    samplerates = []
    for audio_bytes in audio_files:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
        samplerates.append(samplerate)
    # Confirm each file in its own process
    for idx, (audio_bytes, reference_text, samplerate) in enumerate(zip(audio_files, references, samplerates)):
        queue = Queue()
        p = Process(target=confirm_voice, args=(audio_bytes, reference_text, samplerate, queue))
        processes.append(p)
        queues.append(queue)
        p.start()
    for p in processes:
        p.join()
    for queue in queues:
        results.append(queue.get())
    return jsonify({'results': results})

@app.route('/transcribe', methods=['POST'])
def transcribe():
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    audio_bytes = audio_file.read()
    data, samplerate = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # Use the first channel if stereo
    # Convert to 16-bit PCM
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    result = recognizer.Result()
    print(result)  # For debugging
    text = json.loads(result).get('text', '')
    return jsonify({'transcription': text})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
vosk/vosk_service/requirements.txt (new file)
vosk
Flask
soundfile
numpy