Another iteration from deepseek

2026-02-20 03:10:07 +03:00
parent eaa2b1bd7c
commit 2a9b1022c1
7 changed files with 148 additions and 16 deletions
--- a/services/asr/app.py
+++ b/services/asr/app.py
@@ -1,5 +1,5 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, File, UploadFile
-import redis
+import redis.asyncio as redis
 import os
 app = FastAPI()
@@ -12,4 +12,7 @@ r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 async def health():
    return {"status": "ok", "service": "asr"}
-# Позже здесь будут эндпоинты для приёма аудио и возврата текста
+@app.post("/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
    # Пока просто заглушка
    return {"text": "тестовая транскрипция"}
--- a/services/asr/requirements.txt
+++ b/services/asr/requirements.txt
@@ -1,5 +1,5 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
--- a/services/orchestrator/requirements.txt
+++ b/services/orchestrator/requirements.txt
@@ -1,5 +1,5 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
--- a/services/speaker-id/requirements.txt
+++ b/services/speaker-id/requirements.txt
@@ -1,5 +1,5 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
--- a/services/tts/requirements.txt
+++ b/services/tts/requirements.txt
@@ -1,5 +1,5 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
--- a/services/vad/app.py
+++ b/services/vad/app.py
@@ -1,26 +1,153 @@
-from fastapi import FastAPI
+import asyncio
-import redis
+import numpy as np
 import torch
 import redis.asyncio as redis
 from fastapi import FastAPI, WebSocket, HTTPException
 import os
 import io
 import wave
 import requests
 app = FastAPI()
 # Redis connection
 redis_host = os.getenv("REDIS_HOST", "localhost")
 redis_port = int(os.getenv("REDIS_PORT", 6379))
 r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 # Silero VAD model (load once)
 print("Loading Silero VAD model...")
 model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
 (get_speech_timestamps, _, _, _, _) = utils
 print("VAD model loaded.")
 # Configuration
 SAMPLE_RATE = 16000
 MIN_SPEECH_DURATION = 0.5  # seconds
 MIN_SILENCE_DURATION = 0.5  # seconds
 BUFFER_MAX_DURATION = 30  # max buffer size in seconds
 BUFFER_MAX_SAMPLES = BUFFER_MAX_DURATION * SAMPLE_RATE
 # Audio buffer for current speech segment
 audio_buffer = bytearray()
 # State
 is_speaking = False
 last_speech_end = 0
@app.get("/health")
 async def health():
    return {"status": "ok", "service": "vad"}
@app.websocket("/audio-stream")
-async def audio_stream(websocket):
+async def audio_stream(websocket: WebSocket):
    global audio_buffer, is_speaking, last_speech_end
    await websocket.accept()
    print("Client connected to VAD")
    # We'll collect audio in a temporary buffer for VAD processing
    temp_buffer = bytearray()
    try:
        while True:
-            data = await websocket.receive_bytes()
+            # Receive audio chunk (bytes, int16, 16kHz mono)
-            # Пока ничего не делаем с данными
+            chunk = await websocket.receive_bytes()
-            # В будущем здесь будет VAD-обработка
+            temp_buffer.extend(chunk)
-            pass
+            
            # Process when we have enough for a window (e.g., 500ms)
            if len(temp_buffer) >= SAMPLE_RATE // 2 * 2:  # 500ms = 8000 samples (each 2 bytes)
                # Convert to numpy array (int16) then to float32 for VAD
                audio_int16 = np.frombuffer(temp_buffer, dtype=np.int16)
                audio_float32 = audio_int16.astype(np.float32) / 32768.0
                # Get speech timestamps for this chunk
                speech_ts = get_speech_timestamps(
                    audio_float32, 
                    model, 
                    sampling_rate=SAMPLE_RATE,
                    threshold=0.5,
                    min_speech_duration_ms=int(MIN_SPEECH_DURATION * 1000),
                    min_silence_duration_ms=int(MIN_SILENCE_DURATION * 1000)
                )
                if speech_ts:
                    # There is speech in this chunk
                    if not is_speaking:
                        # Speech just started
                        is_speaking = True
                        # Send interrupt signal to Redis (to stop TTS)
                        await r.publish("interrupt", "1")
                        print("Speech started, interrupt sent")
                    # Add the whole chunk to the main buffer (we might want to cut exactly, but for simplicity)
                    audio_buffer.extend(temp_buffer)
                    temp_buffer.clear()
                else:
                    # No speech in this chunk
                    if is_speaking:
                        # Possible end of speech, but we should wait a bit before finalizing
                        # For now, we'll just accumulate silence for a while, then send
                        audio_buffer.extend(temp_buffer)
                        temp_buffer.clear()
                        # Check if silence duration exceeded threshold
                        # We need to track time, but for simplicity we'll assume this chunk is silence and just continue
                        # A real implementation would check accumulated silence length
                        # Here we'll just keep adding, and later send when buffer stops growing for some time
                        # However, for a simple prototype, we can just send when we detect end of speech
                        # We'll use a timer: after speech, start a timer; if no new speech within 1 sec, send
                        # We'll implement timer later. For now, we'll just send after 1 second of no speech
                        # This requires asyncio.sleep and tracking.
                        # Instead, we'll add a simple approach: after each chunk with no speech, if buffer not empty, check if we should send.
                        # But to avoid complexity, we'll just send after a fixed silence threshold.
                        # Let's implement a more robust approach using asyncio.create_task
                        asyncio.create_task(flush_after_silence())
                    else:
                        # No speech and not speaking, just keep temp buffer but maybe drop if too long? We'll limit.
                        if len(temp_buffer) > SAMPLE_RATE * 10:  # keep max 10 seconds non-speech
                            temp_buffer.clear()
            # Limit main buffer size
            if len(audio_buffer) > BUFFER_MAX_SAMPLES * 2:
                audio_buffer = audio_buffer[-BUFFER_MAX_SAMPLES * 2:]
    except Exception as e:
        print(f"VAD connection closed: {e}")
    finally:
        # Clean up if needed
        pass
 async def flush_after_silence():
    """Wait for silence duration then send audio to ASR."""
    await asyncio.sleep(MIN_SILENCE_DURATION)
    global audio_buffer, is_speaking
    if audio_buffer and not is_speaking:
        # Send to ASR
        await send_to_asr(audio_buffer)
        audio_buffer.clear()
 async def send_to_asr(audio_data: bytes):
    """Send audio segment to ASR service and get transcription."""
    # Prepare WAV file in memory
    wav_io = io.BytesIO()
    with wave.open(wav_io, 'wb') as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # 16-bit
        wav.setframerate(SAMPLE_RATE)
        wav.writeframes(audio_data)
    wav_io.seek(0)
    try:
        response = requests.post(
            "http://jarvis-asr:8000/transcribe",
            files={"audio": ("audio.wav", wav_io, "audio/wav")}
        )
        if response.status_code == 200:
            text = response.json().get("text", "")
            print(f"ASR result: {text}")
            # Here you would send the text to orchestrator for further processing
            # For now, just publish to Redis? We'll do later.
    except Exception as e:
        print(f"Error sending to ASR: {e}")
--- a/services/vad/requirements.txt
+++ b/services/vad/requirements.txt
@@ -1,6 +1,8 @@
 fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
-# Позже добавим silero-vad, numpy, torch и т.д.
+numpy==1.26.4
 torch==2.5.1
 silero-vad==6.2.0