This commit is contained in:
Alireza
2025-07-31 17:35:08 +03:30
commit 640363fef2
27 changed files with 4201 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
FROM python:3.10-slim

# Install ffmpeg (audio decoding); skip recommended packages and clean the
# apt cache in the same layer so the image stays small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer is cached until
# requirements.txt changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy service code
COPY app.py ./
# Copy model directory (can alternatively be bind-mounted to /app/model at run time)
COPY model/ ./model/

# Run as a non-root user; port 5000 is unprivileged so no extra caps needed.
RUN useradd --system --uid 10001 --create-home appuser
USER appuser

EXPOSE 5000
CMD ["python", "app.py"]

View File

@@ -0,0 +1,26 @@
# Vosk Speech-to-Text Docker Service
## Setup
1. Download a Vosk model (e.g. `vosk-model-small-en-us-0.15.zip`) and extract it into a directory named `model`:
```sh
unzip vosk-model-small-en-us-0.15.zip
mv vosk-model-small-en-us-0.15 model
```
2. Build the Docker image:
```sh
docker build -t vosk-api .
```
3. Run the Docker container (mounting the model directory):
```sh
docker run -p 5000:5000 -v $(pwd)/model:/app/model vosk-api
```
## API Usage
POST `/transcribe` with form-data key `audio` (WAV/FLAC/OGG file). Returns JSON with `transcription`.

108
vosk/vosk_service/app.py Normal file
View File

@@ -0,0 +1,108 @@
from flask import Flask, request, jsonify
from vosk import Model, KaldiRecognizer
import soundfile as sf
import io
import os
import json
import numpy as np
from multiprocessing import Process, Queue
import difflib
app = Flask(__name__)

# Absolute path where the Vosk model directory must be present (copied into
# the image or bind-mounted at run time).
MODEL_PATH = "/app/model"

# Check if model exists and load it.
# Fail fast at import time: the service cannot serve any request without a
# model, so raising here is preferable to erroring on every request.
print(f"Checking for model at: {MODEL_PATH}")
if os.path.exists(MODEL_PATH):
    print(f"Model directory exists at {MODEL_PATH}")
    print(f"Contents: {os.listdir(MODEL_PATH)}")
    try:
        # Loads the Vosk model from disk; can fail if the directory is empty
        # or contains a partially-extracted model.
        model = Model(MODEL_PATH)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise RuntimeError(f"Failed to load Vosk model: {e}")
else:
    print(f"Model directory not found at {MODEL_PATH}")
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}. Please download and mount a model.")
def similarity(a, b):
    """Return the difflib similarity ratio between two strings (0.0–1.0)."""
    matcher = difflib.SequenceMatcher(None, a, b)
    return matcher.ratio()
def confirm_voice(audio_bytes, reference_text, samplerate, queue):
    """Worker: transcribe one audio clip and compare it to a reference text.

    Args:
        audio_bytes: raw encoded audio file contents (WAV/FLAC/OGG).
        reference_text: expected utterance to compare against.
        samplerate: sample rate to configure the recognizer with.
        queue: multiprocessing.Queue receiving a dict with 'transcription',
            'similarity' and 'confirmed' (similarity > 0.2).
    """
    data, _ = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # keep only the first channel of multi-channel audio
    if data.dtype != np.int16:
        # Vosk expects 16-bit PCM; soundfile returns float in [-1, 1] by default.
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    # FinalResult() flushes buffered audio. Result() can yield an empty
    # transcription when AcceptWaveform has not detected an utterance end.
    result = recognizer.FinalResult()
    text = json.loads(result).get('text', '')
    sim = similarity(text, reference_text)
    # NOTE(review): the 0.2 confirmation threshold looks arbitrary — confirm
    # it against real data.
    queue.put({'transcription': text, 'similarity': sim, 'confirmed': sim > 0.2})
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: report the service identity and status as JSON.

    NOTE(review): 'model' is hardcoded to 'persian' while the README refers
    to an English model — confirm which model this deployment ships.
    """
    payload = {
        'status': 'ok',
        'service': 'vosk-transcription-api',
        'model': 'persian',
    }
    return jsonify(payload)
@app.route('/batch_confirm', methods=['POST'])
def batch_confirm():
    """Verify several audio clips against reference texts in parallel.

    Expects multipart/form-data with:
      * 'references': a JSON-encoded list of reference strings
      * one file field per reference, named 'audio0', 'audio1', ...

    Returns JSON {'results': [...]} where each entry carries 'transcription',
    'similarity' and 'confirmed' (produced by confirm_voice), in the same
    order as the references.
    """
    # Expecting a multipart/form-data with multiple audio files and a JSON list of references
    # audio files: audio0, audio1, ...
    # references: JSON list in 'references' field
    references = request.form.get('references')
    if not references:
        return jsonify({'error': 'Missing references'}), 400
    try:
        references = json.loads(references)
    except Exception:
        return jsonify({'error': 'Invalid references JSON'}), 400
    audio_files = []
    # Require exactly one audio field per reference; any gap is a client error.
    for i in range(len(references)):
        audio_file = request.files.get(f'audio{i}')
        if not audio_file:
            return jsonify({'error': f'Missing audio file audio{i}'}), 400
        audio_files.append(audio_file.read())
    results = []
    processes = []
    queues = []
    # Get sample rates for each audio
    # NOTE(review): each clip is decoded here only for its sample rate and
    # decoded again inside the worker — acceptable for small batches.
    samplerates = []
    for audio_bytes in audio_files:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
        samplerates.append(samplerate)
    # One worker process and one result queue per clip.
    # NOTE(review): process count is unbounded — a large batch spawns that
    # many workers at once; consider a pool if batches can be big.
    for idx, (audio_bytes, reference_text, samplerate) in enumerate(zip(audio_files, references, samplerates)):
        queue = Queue()
        p = Process(target=confirm_voice, args=(audio_bytes, reference_text, samplerate, queue))
        processes.append(p)
        queues.append(queue)
        p.start()
    for p in processes:
        p.join()
    # Queues are drained in submission order, so results align with references.
    for queue in queues:
        results.append(queue.get())
    return jsonify({'results': results})
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe a single uploaded audio file.

    Expects multipart/form-data with an 'audio' file field (WAV/FLAC/OGG).
    Returns JSON {'transcription': <text>} on success, or {'error': ...}
    with HTTP 400 on a missing or unreadable upload.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    audio_bytes = audio_file.read()
    try:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        # Reject undecodable/unsupported audio with a client error instead of
        # letting the exception surface as a 500.
        return jsonify({'error': f'Could not read audio file: {e}'}), 400
    if len(data.shape) > 1:
        data = data[:, 0]  # Use first channel if stereo
    # Convert to 16-bit PCM; soundfile returns float in [-1, 1] by default.
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    # FinalResult() flushes buffered audio. Result() can yield an empty
    # transcription when no utterance boundary was detected.
    result = recognizer.FinalResult()
    print(result)  # For debugging
    text = json.loads(result).get('text', '')
    return jsonify({'transcription': text})
if __name__ == '__main__':
    # Bind on all interfaces so the service is reachable from outside the
    # container; port matches the Dockerfile's EXPOSE 5000.
    app.run(host='0.0.0.0', port=5000)

View File

@@ -0,0 +1,3 @@
Flask
numpy
soundfile
vosk