Another iteration from deepseek
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
from fastapi import FastAPI
|
from fastapi import FastAPI, File, UploadFile
|
||||||
import redis
|
import redis.asyncio as redis
|
||||||
import os
|
import os
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
@@ -12,4 +12,7 @@ r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
|||||||
async def health():
|
async def health():
|
||||||
return {"status": "ok", "service": "asr"}
|
return {"status": "ok", "service": "asr"}
|
||||||
|
|
||||||
# Позже здесь будут эндпоинты для приёма аудио и возврата текста
|
@app.post("/transcribe")
|
||||||
|
async def transcribe(audio: UploadFile = File(...)):
|
||||||
|
# Пока просто заглушка
|
||||||
|
return {"text": "тестовая транскрипция"}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.30.1
|
uvicorn[standard]==0.30.1
|
||||||
redis==5.0.4
|
redis>=5.0.0
|
||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.30.1
|
uvicorn[standard]==0.30.1
|
||||||
redis==5.0.4
|
redis>=5.0.0
|
||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.30.1
|
uvicorn[standard]==0.30.1
|
||||||
redis==5.0.4
|
redis>=5.0.0
|
||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.30.1
|
uvicorn[standard]==0.30.1
|
||||||
redis==5.0.4
|
redis>=5.0.0
|
||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
|||||||
@@ -1,26 +1,153 @@
|
|||||||
from fastapi import FastAPI
|
import asyncio
|
||||||
import redis
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import redis.asyncio as redis
|
||||||
|
from fastapi import FastAPI, WebSocket, HTTPException
|
||||||
import os
|
import os
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
import requests
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
# Redis connection
|
||||||
redis_host = os.getenv("REDIS_HOST", "localhost")
|
redis_host = os.getenv("REDIS_HOST", "localhost")
|
||||||
redis_port = int(os.getenv("REDIS_PORT", 6379))
|
redis_port = int(os.getenv("REDIS_PORT", 6379))
|
||||||
r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
r = redis.Redis(host=redis_host, port=redis_port, decode_responses=True)
|
||||||
|
|
||||||
|
# Silero VAD model (load once)
|
||||||
|
print("Loading Silero VAD model...")
|
||||||
|
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
|
||||||
|
(get_speech_timestamps, _, _, _, _) = utils
|
||||||
|
print("VAD model loaded.")
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
SAMPLE_RATE = 16000
|
||||||
|
MIN_SPEECH_DURATION = 0.5 # seconds
|
||||||
|
MIN_SILENCE_DURATION = 0.5 # seconds
|
||||||
|
BUFFER_MAX_DURATION = 30 # max buffer size in seconds
|
||||||
|
BUFFER_MAX_SAMPLES = BUFFER_MAX_DURATION * SAMPLE_RATE
|
||||||
|
|
||||||
|
# Audio buffer for current speech segment
|
||||||
|
audio_buffer = bytearray()
|
||||||
|
|
||||||
|
# State
|
||||||
|
is_speaking = False
|
||||||
|
last_speech_end = 0
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health():
|
async def health():
|
||||||
return {"status": "ok", "service": "vad"}
|
return {"status": "ok", "service": "vad"}
|
||||||
|
|
||||||
@app.websocket("/audio-stream")
|
@app.websocket("/audio-stream")
|
||||||
async def audio_stream(websocket):
|
async def audio_stream(websocket: WebSocket):
|
||||||
|
global audio_buffer, is_speaking, last_speech_end
|
||||||
await websocket.accept()
|
await websocket.accept()
|
||||||
print("Client connected to VAD")
|
print("Client connected to VAD")
|
||||||
|
|
||||||
|
# We'll collect audio in a temporary buffer for VAD processing
|
||||||
|
temp_buffer = bytearray()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
data = await websocket.receive_bytes()
|
# Receive audio chunk (bytes, int16, 16kHz mono)
|
||||||
# Пока ничего не делаем с данными
|
chunk = await websocket.receive_bytes()
|
||||||
# В будущем здесь будет VAD-обработка
|
temp_buffer.extend(chunk)
|
||||||
pass
|
|
||||||
|
# Process when we have enough for a window (e.g., 500ms)
|
||||||
|
if len(temp_buffer) >= SAMPLE_RATE // 2 * 2: # 500ms = 8000 samples (each 2 bytes)
|
||||||
|
# Convert to numpy array (int16) then to float32 for VAD
|
||||||
|
audio_int16 = np.frombuffer(temp_buffer, dtype=np.int16)
|
||||||
|
audio_float32 = audio_int16.astype(np.float32) / 32768.0
|
||||||
|
|
||||||
|
# Get speech timestamps for this chunk
|
||||||
|
speech_ts = get_speech_timestamps(
|
||||||
|
audio_float32,
|
||||||
|
model,
|
||||||
|
sampling_rate=SAMPLE_RATE,
|
||||||
|
threshold=0.5,
|
||||||
|
min_speech_duration_ms=int(MIN_SPEECH_DURATION * 1000),
|
||||||
|
min_silence_duration_ms=int(MIN_SILENCE_DURATION * 1000)
|
||||||
|
)
|
||||||
|
|
||||||
|
if speech_ts:
|
||||||
|
# There is speech in this chunk
|
||||||
|
if not is_speaking:
|
||||||
|
# Speech just started
|
||||||
|
is_speaking = True
|
||||||
|
# Send interrupt signal to Redis (to stop TTS)
|
||||||
|
await r.publish("interrupt", "1")
|
||||||
|
print("Speech started, interrupt sent")
|
||||||
|
|
||||||
|
# Add the whole chunk to the main buffer (we might want to cut exactly, but for simplicity)
|
||||||
|
audio_buffer.extend(temp_buffer)
|
||||||
|
temp_buffer.clear()
|
||||||
|
else:
|
||||||
|
# No speech in this chunk
|
||||||
|
if is_speaking:
|
||||||
|
# Possible end of speech, but we should wait a bit before finalizing
|
||||||
|
# For now, we'll just accumulate silence for a while, then send
|
||||||
|
audio_buffer.extend(temp_buffer)
|
||||||
|
temp_buffer.clear()
|
||||||
|
|
||||||
|
# Check if silence duration exceeded threshold
|
||||||
|
# We need to track time, but for simplicity we'll assume this chunk is silence and just continue
|
||||||
|
# A real implementation would check accumulated silence length
|
||||||
|
# Here we'll just keep adding, and later send when buffer stops growing for some time
|
||||||
|
|
||||||
|
# However, for a simple prototype, we can just send when we detect end of speech
|
||||||
|
# We'll use a timer: after speech, start a timer; if no new speech within 1 sec, send
|
||||||
|
# We'll implement timer later. For now, we'll just send after 1 second of no speech
|
||||||
|
# This requires asyncio.sleep and tracking.
|
||||||
|
# Instead, we'll add a simple approach: after each chunk with no speech, if buffer not empty, check if we should send.
|
||||||
|
# But to avoid complexity, we'll just send after a fixed silence threshold.
|
||||||
|
# Let's implement a more robust approach using asyncio.create_task
|
||||||
|
asyncio.create_task(flush_after_silence())
|
||||||
|
|
||||||
|
else:
|
||||||
|
# No speech and not speaking, just keep temp buffer but maybe drop if too long? We'll limit.
|
||||||
|
if len(temp_buffer) > SAMPLE_RATE * 10: # keep max 10 seconds non-speech
|
||||||
|
temp_buffer.clear()
|
||||||
|
|
||||||
|
# Limit main buffer size
|
||||||
|
if len(audio_buffer) > BUFFER_MAX_SAMPLES * 2:
|
||||||
|
audio_buffer = audio_buffer[-BUFFER_MAX_SAMPLES * 2:]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"VAD connection closed: {e}")
|
print(f"VAD connection closed: {e}")
|
||||||
|
finally:
|
||||||
|
# Clean up if needed
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def flush_after_silence():
|
||||||
|
"""Wait for silence duration then send audio to ASR."""
|
||||||
|
await asyncio.sleep(MIN_SILENCE_DURATION)
|
||||||
|
global audio_buffer, is_speaking
|
||||||
|
if audio_buffer and not is_speaking:
|
||||||
|
# Send to ASR
|
||||||
|
await send_to_asr(audio_buffer)
|
||||||
|
audio_buffer.clear()
|
||||||
|
|
||||||
|
async def send_to_asr(audio_data: bytes):
|
||||||
|
"""Send audio segment to ASR service and get transcription."""
|
||||||
|
# Prepare WAV file in memory
|
||||||
|
wav_io = io.BytesIO()
|
||||||
|
with wave.open(wav_io, 'wb') as wav:
|
||||||
|
wav.setnchannels(1)
|
||||||
|
wav.setsampwidth(2) # 16-bit
|
||||||
|
wav.setframerate(SAMPLE_RATE)
|
||||||
|
wav.writeframes(audio_data)
|
||||||
|
wav_io.seek(0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(
|
||||||
|
"http://jarvis-asr:8000/transcribe",
|
||||||
|
files={"audio": ("audio.wav", wav_io, "audio/wav")}
|
||||||
|
)
|
||||||
|
if response.status_code == 200:
|
||||||
|
text = response.json().get("text", "")
|
||||||
|
print(f"ASR result: {text}")
|
||||||
|
# Here you would send the text to orchestrator for further processing
|
||||||
|
# For now, just publish to Redis? We'll do later.
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error sending to ASR: {e}")
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
fastapi==0.115.0
|
fastapi==0.115.0
|
||||||
uvicorn[standard]==0.30.1
|
uvicorn[standard]==0.30.1
|
||||||
redis==5.0.4
|
redis>=5.0.0
|
||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
# Позже добавим silero-vad, numpy, torch и т.д.
|
numpy==1.26.4
|
||||||
|
torch==2.5.1
|
||||||
|
silero-vad==6.2.0
|
||||||
|
|||||||
Reference in New Issue
Block a user