Another iteration from deepseek
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from fastapi import FastAPI
|
||||
import redis
|
||||
from fastapi import FastAPI, File, UploadFile
|
||||
import redis.asyncio as redis
|
||||
import os
|
||||
|
||||
app = FastAPI()
|
||||
@@ -12,4 +12,7 @@ r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
||||
async def health():
    """Liveness probe: report that the ASR service is up and responding."""
    return {"status": "ok", "service": "asr"}
|
||||
|
||||
# Позже здесь будут эндпоинты для приёма аудио и возврата текста
|
||||
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """Accept an uploaded audio file and return its transcription.

    Currently a stub: the uploaded audio is ignored and a fixed placeholder
    transcription is returned.
    """
    # TODO: plug a real ASR model in here.
    return {"text": "тестовая транскрипция"}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.1
|
||||
redis==5.0.4
|
||||
redis>=5.0.0
|
||||
pydantic==2.8.2
|
||||
python-dotenv==1.0.1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.1
|
||||
redis==5.0.4
|
||||
redis>=5.0.0
|
||||
pydantic==2.8.2
|
||||
python-dotenv==1.0.1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.1
|
||||
redis==5.0.4
|
||||
redis>=5.0.0
|
||||
pydantic==2.8.2
|
||||
python-dotenv==1.0.1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.1
|
||||
redis==5.0.4
|
||||
redis>=5.0.0
|
||||
pydantic==2.8.2
|
||||
python-dotenv==1.0.1
|
||||
|
||||
@@ -1,26 +1,153 @@
|
||||
from fastapi import FastAPI
|
||||
import redis
|
||||
import asyncio
|
||||
import numpy as np
|
||||
import torch
|
||||
import redis.asyncio as redis
|
||||
from fastapi import FastAPI, WebSocket, HTTPException
|
||||
import os
|
||||
import io
|
||||
import wave
|
||||
import requests
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Redis connection
|
||||
redis_host = os.getenv("REDIS_HOST", "localhost")
|
||||
redis_port = int(os.getenv("REDIS_PORT", 6379))
|
||||
r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
||||
|
||||
# Silero VAD model (load once)
|
||||
print("Loading Silero VAD model...")
|
||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
|
||||
(get_speech_timestamps, _, _, _, _) = utils
|
||||
print("VAD model loaded.")
|
||||
|
||||
# Configuration
|
||||
SAMPLE_RATE = 16000
|
||||
MIN_SPEECH_DURATION = 0.5 # seconds
|
||||
MIN_SILENCE_DURATION = 0.5 # seconds
|
||||
BUFFER_MAX_DURATION = 30 # max buffer size in seconds
|
||||
BUFFER_MAX_SAMPLES = BUFFER_MAX_DURATION * SAMPLE_RATE
|
||||
|
||||
# Audio buffer for current speech segment
|
||||
audio_buffer = bytearray()
|
||||
|
||||
# State
|
||||
is_speaking = False
|
||||
last_speech_end = 0
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: report that the VAD service is up and responding."""
    return {"status": "ok", "service": "vad"}
|
||||
|
||||
@app.websocket("/audio-stream")
async def audio_stream(websocket: WebSocket):
    """Receive a raw PCM audio stream and segment it by voice activity.

    The client sends binary chunks of 16 kHz mono int16 PCM.  Chunks are
    accumulated until roughly 500 ms of audio is available, then Silero VAD
    is run on the window.  When speech starts, an "interrupt" message is
    published to Redis (so any TTS playback can stop); when speech ends, a
    background task flushes the buffered segment to the ASR service after a
    short silence delay.
    """
    global audio_buffer, is_speaking, last_speech_end
    await websocket.accept()
    print("Client connected to VAD")

    # Scratch buffer that accumulates raw bytes until a full VAD window
    # (~500 ms) is available.
    temp_buffer = bytearray()

    try:
        while True:
            # Receive one audio chunk (bytes, int16, 16 kHz mono —
            # assumed format of the client stream; TODO confirm).
            chunk = await websocket.receive_bytes()
            temp_buffer.extend(chunk)

            # Run VAD once we have >= 500 ms of audio
            # (8000 samples * 2 bytes per int16 sample).
            if len(temp_buffer) >= SAMPLE_RATE // 2 * 2:
                # int16 bytes -> float32 in [-1, 1) as the VAD model expects.
                audio_int16 = np.frombuffer(temp_buffer, dtype=np.int16)
                audio_float32 = audio_int16.astype(np.float32) / 32768.0

                speech_ts = get_speech_timestamps(
                    audio_float32,
                    model,
                    sampling_rate=SAMPLE_RATE,
                    threshold=0.5,
                    min_speech_duration_ms=int(MIN_SPEECH_DURATION * 1000),
                    min_silence_duration_ms=int(MIN_SILENCE_DURATION * 1000),
                )

                if speech_ts:
                    # Speech detected in this window.
                    if not is_speaking:
                        # Speech just started: tell the rest of the system
                        # (e.g. TTS) to stop talking.
                        is_speaking = True
                        await r.publish("interrupt", "1")
                        print("Speech started, interrupt sent")

                    # Keep the whole window; trimming exactly to the speech
                    # timestamps is left for later.
                    audio_buffer.extend(temp_buffer)
                    temp_buffer.clear()
                else:
                    # No speech detected in this window.
                    if is_speaking:
                        # Speech appears to have ended: keep the trailing
                        # silence and schedule a delayed flush to ASR.
                        audio_buffer.extend(temp_buffer)
                        temp_buffer.clear()
                        # BUGFIX: reset the flag here.  Previously it was
                        # never cleared, so flush_after_silence()'s
                        # "not is_speaking" guard could never pass and the
                        # buffered segment was never sent to ASR.
                        is_speaking = False
                        asyncio.create_task(flush_after_silence())
                    else:
                        # Idle silence: cap the scratch buffer at ~10 s.
                        # BUGFIX: compare bytes to bytes — int16 samples are
                        # 2 bytes each, so the old "SAMPLE_RATE * 10" limit
                        # actually capped at 5 s.
                        if len(temp_buffer) > SAMPLE_RATE * 10 * 2:
                            temp_buffer.clear()

                # Cap the main buffer at BUFFER_MAX_DURATION seconds.
                # Trim in place (rather than rebinding the global to a new
                # slice) so a pending flush task holding a reference keeps
                # seeing the same object.
                if len(audio_buffer) > BUFFER_MAX_SAMPLES * 2:
                    del audio_buffer[: len(audio_buffer) - BUFFER_MAX_SAMPLES * 2]

    except Exception as e:
        # WebSocket disconnects surface here as exceptions.
        print(f"VAD connection closed: {e}")
|
||||
|
||||
async def flush_after_silence():
    """Wait out the silence window, then ship buffered audio to ASR.

    If speech resumed during the wait (``is_speaking`` set again), the
    flush is skipped and the buffer is left for the next attempt.
    """
    global audio_buffer, is_speaking
    await asyncio.sleep(MIN_SILENCE_DURATION)
    if audio_buffer and not is_speaking:
        await send_to_asr(audio_buffer)
        audio_buffer.clear()
|
||||
|
||||
async def send_to_asr(audio_data: bytes):
    """Wrap raw PCM audio in a WAV container and POST it to the ASR service.

    :param audio_data: raw 16 kHz mono int16 PCM bytes.

    The transcription (if any) is only logged for now.  Errors are logged
    and swallowed so that losing one segment never crashes the VAD stream.
    """
    # Build an in-memory WAV file around the raw PCM payload.
    wav_io = io.BytesIO()
    with wave.open(wav_io, 'wb') as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # 16-bit samples
        wav.setframerate(SAMPLE_RATE)
        wav.writeframes(audio_data)
    wav_io.seek(0)

    try:
        # BUGFIX: requests.post is blocking; calling it directly inside an
        # async def would stall the event loop (including the WebSocket
        # receive loop) for the whole upload.  Run it in a worker thread.
        response = await asyncio.to_thread(
            requests.post,
            "http://jarvis-asr:8000/transcribe",
            files={"audio": ("audio.wav", wav_io, "audio/wav")},
        )
        if response.status_code == 200:
            text = response.json().get("text", "")
            print(f"ASR result: {text}")
            # TODO: forward the text to the orchestrator (e.g. via Redis).
    except Exception as e:
        print(f"Error sending to ASR: {e}")
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.1
|
||||
redis==5.0.4
|
||||
redis>=5.0.0
|
||||
pydantic==2.8.2
|
||||
python-dotenv==1.0.1
|
||||
# Позже добавим silero-vad, numpy, torch и т.д.
|
||||
numpy==1.26.4
|
||||
torch==2.5.1
|
||||
silero-vad==6.2.0
|
||||
|
||||
Reference in New Issue
Block a user