Sure! Pl
This commit is contained in:
22
vosk/vosk_service/Dockerfile
Normal file
22
vosk/vosk_service/Dockerfile
Normal file
@@ -0,0 +1,22 @@
|
||||
# Vosk speech-to-text HTTP service.
FROM python:3.10-slim

# Install system dependencies (ffmpeg for audio decoding).
# --no-install-recommends keeps the image small; clearing the apt lists
# avoids baking the package index into the layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer is cached independently
# of application-code changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy service code
COPY app.py ./

# Copy model directory (can also be bind-mounted over at runtime)
COPY model/ ./model/

EXPOSE 5000

CMD ["python", "app.py"]
|
||||
26
vosk/vosk_service/README.md
Normal file
26
vosk/vosk_service/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Vosk Speech-to-Text Docker Service
|
||||
|
||||
## Setup
|
||||
|
||||
1. Download and extract a Vosk model (already downloading `vosk-model-small-en-us-0.15.zip`):
|
||||
|
||||
```sh
unzip vosk-model-small-en-us-0.15.zip -d model
# flatten the nested directory the archive extracts into
mv model/vosk-model-small-en-us-0.15/* model/
```
|
||||
|
||||
2. Build the Docker image:
|
||||
|
||||
```sh
|
||||
docker build -t vosk-api .
|
||||
```
|
||||
|
||||
3. Run the Docker container (mounting the model directory):
|
||||
|
||||
```sh
|
||||
docker run -p 5000:5000 -v $(pwd)/model:/app/model vosk-api
|
||||
```
|
||||
|
||||
## API Usage
|
||||
|
||||
POST `/transcribe` with form-data key `audio` (WAV/FLAC/OGG file). Returns JSON with `transcription`.
|
||||
108
vosk/vosk_service/app.py
Normal file
108
vosk/vosk_service/app.py
Normal file
@@ -0,0 +1,108 @@
|
||||
from flask import Flask, request, jsonify
from vosk import Model, KaldiRecognizer
import soundfile as sf
import io
import os
import json
import numpy as np
from multiprocessing import Process, Queue
import difflib

app = Flask(__name__)

# Path where the Vosk acoustic model is expected (copied or mounted in Docker).
MODEL_PATH = "/app/model"

# Load the model eagerly at import time so the service fails fast when the
# model is missing or unreadable, instead of erroring on the first request.
print(f"Checking for model at: {MODEL_PATH}")
if os.path.exists(MODEL_PATH):
    print(f"Model directory exists at {MODEL_PATH}")
    print(f"Contents: {os.listdir(MODEL_PATH)}")
    try:
        model = Model(MODEL_PATH)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Chain the original exception so the underlying cause stays in the
        # traceback instead of being swallowed.
        raise RuntimeError(f"Failed to load Vosk model: {e}") from e
else:
    print(f"Model directory not found at {MODEL_PATH}")
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}. Please download and mount a model.")
|
||||
|
||||
def similarity(a, b):
    """Return the character-level match ratio between *a* and *b* (0.0-1.0)."""
    matcher = difflib.SequenceMatcher(None, a, b)
    return matcher.ratio()
|
||||
|
||||
def confirm_voice(audio_bytes, reference_text, samplerate, queue, threshold=0.2):
    """Transcribe *audio_bytes* and compare the text against *reference_text*.

    Runs as a multiprocessing worker; posts a result dict to *queue* with
    keys 'transcription', 'similarity' and 'confirmed'.

    Args:
        audio_bytes: raw bytes of an audio file decodable by soundfile.
        reference_text: expected transcription to compare against.
        samplerate: sample rate (Hz) handed to the recognizer.  NOTE(review):
            the rate embedded in the audio itself is discarded here — assumes
            the caller probed it from the same bytes; confirm.
        queue: multiprocessing.Queue that receives the result dict.
        threshold: minimum similarity ratio to count as confirmed
            (default 0.2, the previously hard-coded value).
    """
    data, _ = sf.read(io.BytesIO(audio_bytes))
    if len(data.shape) > 1:
        data = data[:, 0]  # keep only the first channel of multi-channel audio
    if data.dtype != np.int16:
        # assumes float samples in [-1.0, 1.0]; scale to 16-bit PCM for Vosk
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    # FinalResult() flushes the recognizer; Result() can yield empty text
    # when the preceding AcceptWaveform() returned False.
    text = json.loads(recognizer.FinalResult()).get('text', '')
    sim = similarity(text, reference_text)
    queue.put({'transcription': text, 'similarity': sim, 'confirmed': sim > threshold})
|
||||
|
||||
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: return the service identity as JSON.

    NOTE(review): the 'model' field claims 'persian', but the Dockerfile and
    README reference an English model (vosk-model-small-en-us-0.15) —
    confirm which is intended.
    """
    payload = {
        'status': 'ok',
        'service': 'vosk-transcription-api',
        'model': 'persian',
    }
    return jsonify(payload)
|
||||
|
||||
@app.route('/batch_confirm', methods=['POST'])
def batch_confirm():
    """Transcribe several uploads in parallel and confirm each against a reference.

    Expects multipart/form-data with:
      - 'references': JSON list of expected transcriptions,
      - one file field per reference, named 'audio0', 'audio1', ...

    Returns JSON {'results': [...]} where each entry has 'transcription',
    'similarity' and 'confirmed' (see confirm_voice).  Returns 400 on
    missing/invalid input.
    """
    references = request.form.get('references')
    if not references:
        return jsonify({'error': 'Missing references'}), 400
    try:
        references = json.loads(references)
    except Exception:
        return jsonify({'error': 'Invalid references JSON'}), 400

    # Collect the raw bytes of every expected audio upload up front.
    audio_files = []
    for i in range(len(references)):
        audio_file = request.files.get(f'audio{i}')
        if not audio_file:
            return jsonify({'error': f'Missing audio file audio{i}'}), 400
        audio_files.append(audio_file.read())

    # Probe each file's sample rate (the workers re-decode the bytes themselves).
    try:
        samplerates = [sf.read(io.BytesIO(b))[1] for b in audio_files]
    except Exception as e:
        # An undecodable upload is a client error, not a server crash.
        return jsonify({'error': f'Could not decode audio: {e}'}), 400

    processes = []
    queues = []
    for audio_bytes, reference_text, samplerate in zip(audio_files, references, samplerates):
        queue = Queue()
        p = Process(target=confirm_voice, args=(audio_bytes, reference_text, samplerate, queue))
        processes.append(p)
        queues.append(queue)
        p.start()

    # Drain the queues BEFORE joining: joining a process that has put data on
    # a queue can deadlock once the queue's pipe buffer fills (documented in
    # the multiprocessing programming guidelines).
    results = [queue.get() for queue in queues]
    for p in processes:
        p.join()
    return jsonify({'results': results})
|
||||
|
||||
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe a single uploaded audio file.

    Expects multipart/form-data with an 'audio' file field (any format
    soundfile can decode, e.g. WAV/FLAC/OGG).  Returns JSON
    {'transcription': <text>}, or 400 when the upload is missing or
    undecodable.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    audio_bytes = audio_file.read()
    try:
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        # An undecodable upload is a client error, not a 500.
        return jsonify({'error': f'Could not decode audio: {e}'}), 400
    if len(data.shape) > 1:
        data = data[:, 0]  # Use first channel if stereo
    # Convert to 16-bit PCM; assumes float samples in [-1.0, 1.0]
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(model, samplerate)
    recognizer.AcceptWaveform(data.tobytes())
    # FinalResult() flushes the recognizer so trailing audio is not dropped.
    result = recognizer.FinalResult()
    print(result)  # For debugging
    text = json.loads(result).get('text', '')
    return jsonify({'transcription': text})
|
||||
|
||||
# Run the Flask development server, bound to all interfaces so the service
# is reachable from outside the container on the EXPOSEd port.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
|
||||
3
vosk/vosk_service/requirements.txt
Normal file
3
vosk/vosk_service/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
vosk
|
||||
Flask
|
||||
soundfile
|
||||
Reference in New Issue
Block a user