From 2a9b1022c15ef8a3efc2db3ca05dc96039ab3cb031800604dee84f5f7dd6cb9b Mon Sep 17 00:00:00 2001
From: Komisar <komisar666@gmail.com>
Date: Fri, 20 Feb 2026 03:10:07 +0300
Subject: [PATCH] Add Silero VAD streaming to the vad service and a /transcribe stub to asr

---
 services/asr/app.py                    |   9 +-
 services/asr/requirements.txt          |   2 +-
 services/orchestrator/requirements.txt |   2 +-
 services/speaker-id/requirements.txt   |   2 +-
 services/tts/requirements.txt          |   2 +-
 services/vad/app.py                    | 141 +++++++++++++++++++++++--
 services/vad/requirements.txt          |   6 +-
 7 files changed, 148 insertions(+), 16 deletions(-)

diff --git a/services/asr/app.py b/services/asr/app.py
index 197ae9e..013ca47 100644
--- a/services/asr/app.py
+++ b/services/asr/app.py
@@ -1,5 +1,5 @@
-﻿from fastapi import FastAPI
-import redis
+﻿from fastapi import FastAPI, File, UploadFile
+import redis.asyncio as redis
 import os
 
 app = FastAPI()
@@ -12,4 +12,7 @@ r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 async def health():
     return {"status": "ok", "service": "asr"}
 
-# Позже здесь будут эндпоинты для приёма аудио и возврата текста
+@app.post("/transcribe")
+async def transcribe(audio: UploadFile = File(...)):
+    """Stub endpoint: accept an uploaded audio file and return a fixed placeholder text."""
+    return {"text": "тестовая транскрипция"}
diff --git a/services/asr/requirements.txt b/services/asr/requirements.txt
index 69de3dc..28114cd 100644
--- a/services/asr/requirements.txt
+++ b/services/asr/requirements.txt
@@ -1,5 +1,6 @@
 ﻿fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
+python-multipart==0.0.9
diff --git a/services/orchestrator/requirements.txt b/services/orchestrator/requirements.txt
index 69de3dc..28114cd 100644
--- a/services/orchestrator/requirements.txt
+++ b/services/orchestrator/requirements.txt
@@ -1,5 +1,5 @@
 ﻿fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
diff --git a/services/speaker-id/requirements.txt b/services/speaker-id/requirements.txt
index 69de3dc..28114cd 100644
--- a/services/speaker-id/requirements.txt
+++ b/services/speaker-id/requirements.txt
@@ -1,5 +1,5 @@
 ﻿fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
diff --git a/services/tts/requirements.txt b/services/tts/requirements.txt
index 69de3dc..28114cd 100644
--- a/services/tts/requirements.txt
+++ b/services/tts/requirements.txt
@@ -1,5 +1,5 @@
 ﻿fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
diff --git a/services/vad/app.py b/services/vad/app.py
index fb2b4f5..8a8ac45 100644
--- a/services/vad/app.py
+++ b/services/vad/app.py
@@ -1,26 +1,153 @@
-﻿from fastapi import FastAPI
-import redis
+﻿import asyncio
+import numpy as np
+import torch
+import redis.asyncio as redis
+from fastapi import FastAPI, WebSocket, HTTPException
 import os
+import io
+import wave
+import requests
 
 app = FastAPI()
 
+# Redis connection
 redis_host = os.getenv("REDIS_HOST", "localhost")
 redis_port = int(os.getenv("REDIS_PORT", 6379))
 r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
 
+# Silero VAD model: fetched through torch.hub and loaded once, at import time
+print("Loading Silero VAD model...")
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+(get_speech_timestamps, _, _, _, _) = utils
+print("VAD model loaded.")
+
+# Configuration (audio is handled as 16-bit samples, i.e. 2 bytes per sample)
+SAMPLE_RATE = 16000
+MIN_SPEECH_DURATION = 0.5  # seconds
+MIN_SILENCE_DURATION = 0.5  # seconds
+BUFFER_MAX_DURATION = 30  # max buffer size in seconds
+BUFFER_MAX_SAMPLES = BUFFER_MAX_DURATION * SAMPLE_RATE
+
+# Audio buffer for the current speech segment; module-level, so shared by every connection
+audio_buffer = bytearray()
+
+# VAD state, also module-level. NOTE(review): last_speech_end is never read in this file
+is_speaking = False
+last_speech_end = 0
+
 @app.get("/health")
 async def health():
     return {"status": "ok", "service": "vad"}
 
 @app.websocket("/audio-stream")
-async def audio_stream(websocket):
+async def audio_stream(websocket: WebSocket):
+    """Receive PCM audio over a WebSocket and segment it with Silero VAD.
+
+    Chunks are expected to be 16-bit mono PCM at SAMPLE_RATE. When speech
+    starts, "1" is published on the Redis "interrupt" channel; when it stops,
+    a delayed flush of the collected segment to the ASR service is scheduled.
+    """
+    global audio_buffer, is_speaking, last_speech_end
     await websocket.accept()
     print("Client connected to VAD")
+
+    # Audio received but not yet committed to the speech segment
+    temp_buffer = bytearray()
+    # Strong references so pending flush tasks are not garbage-collected
+    flush_tasks = set()
+
     try:
         while True:
-            data = await websocket.receive_bytes()
-            # Пока ничего не делаем с данными
-            # В будущем здесь будет VAD-обработка
-            pass
+            # Receive audio chunk (bytes, int16, 16kHz mono)
+            chunk = await websocket.receive_bytes()
+            temp_buffer.extend(chunk)
+
+            # Run VAD once at least one 500 ms window is buffered
+            # (8000 samples, 2 bytes each)
+            if len(temp_buffer) >= SAMPLE_RATE // 2 * 2:
+                # Copy before wrapping: a NumPy view over the bytearray would pin it
+                # and make the later clear()/extend() raise BufferError. `count`
+                # drops a trailing odd byte so a split sample cannot raise ValueError.
+                audio_int16 = np.frombuffer(
+                    bytes(temp_buffer), dtype=np.int16, count=len(temp_buffer) // 2
+                )
+                audio_float32 = audio_int16.astype(np.float32) / 32768.0
+
+                # Get speech timestamps for this window
+                speech_ts = get_speech_timestamps(
+                    audio_float32,
+                    model,
+                    sampling_rate=SAMPLE_RATE,
+                    threshold=0.5,
+                    min_speech_duration_ms=int(MIN_SPEECH_DURATION * 1000),
+                    min_silence_duration_ms=int(MIN_SILENCE_DURATION * 1000)
+                )
+
+                if speech_ts:
+                    if not is_speaking:
+                        # Speech just started: signal listeners (e.g. TTS) to stop
+                        is_speaking = True
+                        await r.publish("interrupt", "1")
+                        print("Speech started, interrupt sent")
+
+                    # Keep the whole window; it is not trimmed to the exact timestamps
+                    audio_buffer.extend(temp_buffer)
+                    temp_buffer.clear()
+                elif is_speaking:
+                    # Speech stopped: keep the trailing silence with the segment
+                    audio_buffer.extend(temp_buffer)
+                    temp_buffer.clear()
+
+                    # Reset the flag so flush_after_silence() can send the segment;
+                    # if speech resumes first, the flag flips back and the flush is skipped.
+                    is_speaking = False
+                    task = asyncio.create_task(flush_after_silence())
+                    flush_tasks.add(task)
+                    task.add_done_callback(flush_tasks.discard)
+                elif len(temp_buffer) > SAMPLE_RATE * 2 * 10:
+                    # Idle: drop buffered non-speech beyond 10 seconds (2 bytes per sample)
+                    temp_buffer.clear()
+
+            # Limit main buffer size: keep only the most recent
+            # BUFFER_MAX_DURATION seconds (2 bytes per sample)
+            if len(audio_buffer) > BUFFER_MAX_SAMPLES * 2:
+                audio_buffer = audio_buffer[-BUFFER_MAX_SAMPLES * 2:]
+
     except Exception as e:
         print(f"VAD connection closed: {e}")
+    finally:
+        # Clean up if needed
+        pass
+
+async def flush_after_silence():
+    """Wait MIN_SILENCE_DURATION seconds, then send the buffered audio to ASR if speech has not resumed."""
+    global audio_buffer, is_speaking
+    await asyncio.sleep(MIN_SILENCE_DURATION)
+    if audio_buffer and not is_speaking:
+        # Detach the segment first so audio arriving during the request is kept
+        segment, audio_buffer = bytes(audio_buffer), bytearray()
+        await send_to_asr(segment)
+
+async def send_to_asr(audio_data: bytes):
+    """POST the PCM segment to the ASR service as WAV and print the text; errors are printed, not raised."""
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, 'wb') as wav:
+        wav.setnchannels(1)
+        wav.setsampwidth(2)  # 16-bit
+        wav.setframerate(SAMPLE_RATE)
+        wav.writeframes(audio_data)
+    wav_io.seek(0)
+    try:
+        # requests blocks: run it in a worker thread so the event loop stays responsive; bound the wait
+        response = await asyncio.to_thread(
+            requests.post, "http://jarvis-asr:8000/transcribe", timeout=30,
+            files={"audio": ("audio.wav", wav_io, "audio/wav")},
+        )
+        if response.status_code == 200:
+            text = response.json().get("text", "")
+            print(f"ASR result: {text}")
+            # TODO: forward the text to the orchestrator
+        else:
+            print(f"ASR returned HTTP {response.status_code}")
+    except Exception as e:
+        print(f"Error sending to ASR: {e}")
diff --git a/services/vad/requirements.txt b/services/vad/requirements.txt
index b0bf70d..e26e7cd 100644
--- a/services/vad/requirements.txt
+++ b/services/vad/requirements.txt
@@ -1,6 +1,9 @@
 ﻿fastapi==0.115.0
 uvicorn[standard]==0.30.1
-redis==5.0.4
+redis>=5.0.0
 pydantic==2.8.2
 python-dotenv==1.0.1
-# Позже добавим silero-vad, numpy, torch и т.д.
+numpy==1.26.4
+torch==2.5.1
+silero-vad==6.2.0
+requests==2.32.3