Compare commits
4 Commits
local_voic
...
dev_debian
| Author | SHA1 | Date | |
|---|---|---|---|
| b44bd350f6 | |||
| 082c055683 | |||
| cd239fde3c | |||
| 095d3edc03 |
41
jarvis.py
41
jarvis.py
@@ -5,6 +5,7 @@ import asyncio
|
||||
import openai
|
||||
import sys
|
||||
import subprocess
|
||||
import edge_tts
|
||||
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
@@ -544,28 +545,42 @@ async def speak_to_user(text):
|
||||
print(text)
|
||||
print(f"{JARVIS_COLOR}{'-'*60}{RESET}\n")
|
||||
|
||||
clean_text = re.sub(r'[^\w\s\d.,!?-]', '', text)
|
||||
"""Generiert eine hochauflösende KI-Stimme via Edge-TTS und spielt sie ab."""
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
piper_path = "/home/meik/jarvis-ai/piper/piper"
|
||||
model_path = "/home/meik/jarvis-ai/de_DE-thorsten-high.onnx"
|
||||
lock_file = Path("/tmp/.jarvis_speaking") # Die Sperr-Datei
|
||||
# Definition der Stimme (Killian und Conrad sind hervorragende deutsche Männerstimmen)
|
||||
VOICE = "de-DE-KillianNeural"
|
||||
OUTPUT_FILE = "/tmp/jarvis_response.mp3"
|
||||
LOCK_FILE = "/tmp/.jarvis_speaking"
|
||||
|
||||
if os.path.exists(piper_path) and os.path.exists(model_path):
|
||||
try:
|
||||
# 1. Sperre setzen
|
||||
lock_file.touch()
|
||||
# 1. Erstelle die Lock-Datei, damit das Mikrofon im Wakeword-Skript stummschaltet
|
||||
with open(LOCK_FILE, "w") as f:
|
||||
f.write("1")
|
||||
|
||||
piper_cmd = f"echo '{clean_text}' | {piper_path} --model {model_path} --output_raw | aplay -r 22050 -f S16_LE -t raw -D pipewire >/dev/null 2>&1"
|
||||
print(f"🔊 J.A.R.V.I.S. spricht: {text}")
|
||||
|
||||
proc = await asyncio.create_subprocess_shell(piper_cmd)
|
||||
# 2. Audio aus der Cloud abrufen (Jetzt sauber mit direktem await!)
|
||||
communicate = edge_tts.Communicate(text, VOICE)
|
||||
await communicate.save(OUTPUT_FILE)
|
||||
|
||||
# 3. Audio ressourcenschonend & asynchron abspielen
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"mpv", "--no-video", OUTPUT_FILE,
|
||||
stdout=asyncio.subprocess.DEVNULL,
|
||||
stderr=asyncio.subprocess.DEVNULL
|
||||
)
|
||||
# Warten, bis mpv fertig gesprochen hat
|
||||
await proc.wait()
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ TTS Fehler: {e}")
|
||||
print(f"❌ Fehler bei der Sprachausgabe: {e}")
|
||||
|
||||
finally:
|
||||
# 2. Sperre IMMER wieder aufheben, wenn Piper fertig ist
|
||||
if lock_file.exists():
|
||||
lock_file.unlink()
|
||||
# 4. Lock-Datei IMMER löschen, damit J.A.R.V.I.S. wieder zuhört
|
||||
if os.path.exists(LOCK_FILE):
|
||||
os.remove(LOCK_FILE)
|
||||
|
||||
# ====================================================
|
||||
# MAIN LOOP
|
||||
|
||||
89
wakeword.py
89
wakeword.py
@@ -5,11 +5,21 @@ import json
|
||||
import queue
|
||||
import time
|
||||
import subprocess
|
||||
import colorama
|
||||
import wave
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
from vosk import Model, KaldiRecognizer
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
from openai import OpenAI
|
||||
|
||||
# ====================================================
|
||||
# PFADE & ENV SETUP (Aus config/.env lesen)
|
||||
# ====================================================
|
||||
BASE_DIR = Path(__file__).resolve().parent
|
||||
CONFIG_DIR = BASE_DIR / "config"
|
||||
ENV_FILE = CONFIG_DIR / ".env"
|
||||
load_dotenv(ENV_FILE)
|
||||
|
||||
MODEL_PATH = "model"
|
||||
AUDIO_RATE = 48000
|
||||
@@ -19,6 +29,11 @@ if not os.path.exists(MODEL_PATH):
|
||||
print(f"❌ Modell-Ordner '{MODEL_PATH}' wurde nicht gefunden!")
|
||||
sys.exit(1)
|
||||
|
||||
# OpenAI Client initialisieren
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
print("⚠️ Warnung: Kein OPENAI_API_KEY in der .env gefunden!")
|
||||
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
audio_queue = queue.Queue()
|
||||
|
||||
def audio_callback(indata, frames, time, status):
|
||||
@@ -26,14 +41,13 @@ def audio_callback(indata, frames, time, status):
|
||||
print(status, file=sys.stderr)
|
||||
audio_queue.put(bytes(indata))
|
||||
|
||||
print("🧠 J.A.R.V.I.S. lädt das Sprachmodell...")
|
||||
print("🧠 J.A.R.V.I.S. lädt das Sprachmodell für das Wake-Word...")
|
||||
model = Model(MODEL_PATH)
|
||||
|
||||
# Zwei Recognizer: Einer für das Wake-Word, einer für den eigentlichen Befehl (offen)
|
||||
# Nur noch EIN Recognizer: Ausschließlich für das Wake-Word ("jarvis")
|
||||
wake_recognizer = KaldiRecognizer(model, AUDIO_RATE, '["jarvis", "[unk]"]')
|
||||
command_recognizer = KaldiRecognizer(model, AUDIO_RATE) # Sucht nach JEDEM deutschen Wort
|
||||
|
||||
print("🎙️ J.A.R.V.I.S. ist online und lauscht... (Sag 'Jarvis')")
|
||||
print("🎙️ J.A.R.V.I.S. läuft im Hybrid-Modus (Vosk + Whisper) und lauscht... (Sag 'Jarvis')")
|
||||
|
||||
with sd.RawInputStream(samplerate=AUDIO_RATE, blocksize=8000, dtype='int16',
|
||||
channels=1, callback=audio_callback):
|
||||
@@ -41,53 +55,75 @@ with sd.RawInputStream(samplerate=AUDIO_RATE, blocksize=8000, dtype='int16',
|
||||
while True:
|
||||
data = audio_queue.get()
|
||||
|
||||
# NEU: Wenn J.A.R.V.I.S. gerade spricht, leere die Queue und ignoriere das Audio
|
||||
# Wenn J.A.R.V.I.S. gerade spricht, leere die Queue und ignoriere das Audio
|
||||
if LOCK_FILE.exists():
|
||||
while not audio_queue.empty():
|
||||
audio_queue.get()
|
||||
wake_recognizer.Reset() # Verhindert, dass Bruchstücke von vorhin gespeichert bleiben
|
||||
wake_recognizer.Reset() # Verhindert alte Bruchstücke
|
||||
continue
|
||||
|
||||
# Phase 1: Auf Wake-Word warten
|
||||
# Phase 1: Auf Wake-Word warten (Lokal via Vosk)
|
||||
if wake_recognizer.AcceptWaveform(data):
|
||||
result = json.loads(wake_recognizer.Result())
|
||||
if "jarvis" in result.get("text", ""):
|
||||
print("\n⚡ [WAKEWORD DETECTED] Ja, Sir?")
|
||||
|
||||
# Bestätigungston abspielen
|
||||
# Kurzer, smarter Beep-Ton (800 Hz, 0.1 Sekunden)
|
||||
# Bestätigungston abspielen (800 Hz, 0.1 Sekunden)
|
||||
duration = 0.1
|
||||
frequency = 800.0
|
||||
t = np.linspace(0, duration, int(AUDIO_RATE * duration), endpoint=False)
|
||||
beep = np.sin(2 * np.pi * frequency * t) * 0.3 # 0.3 für angenehme Lautstärke
|
||||
beep = np.sin(2 * np.pi * frequency * t) * 0.3
|
||||
sd.play(beep, samplerate=AUDIO_RATE)
|
||||
sd.wait()
|
||||
# Warteschlange leeren, um alten Ton nicht als Befehl zu interpretieren
|
||||
|
||||
# Warteschlange leeren, um den Beep nicht selbst aufzunehmen
|
||||
while not audio_queue.empty():
|
||||
audio_queue.get()
|
||||
|
||||
print("👂 Höre zu...")
|
||||
command_text = ""
|
||||
print("👂 Höre zu (Befehlsaufnahme)...")
|
||||
collected_chunks = []
|
||||
start_time = time.time()
|
||||
|
||||
# Phase 2: Für 4 Sekunden den darauffolgenden Befehl aufnehmen
|
||||
# Phase 2: Für 4 Sekunden die Rohdaten aus dem Stream sammeln
|
||||
while time.time() - start_time < 4.0:
|
||||
cmd_data = audio_queue.get()
|
||||
if command_recognizer.AcceptWaveform(cmd_data):
|
||||
res = json.loads(command_recognizer.Result())
|
||||
command_text += " " + res.get("text", "")
|
||||
try:
|
||||
# Kurzer Timeout, damit die Schleife agil bleibt
|
||||
cmd_data = audio_queue.get(timeout=0.2)
|
||||
collected_chunks.append(cmd_data)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
# Letzten Rest auslesen
|
||||
final_res = json.loads(command_recognizer.FinalResult())
|
||||
command_text += " " + final_res.get("text", "")
|
||||
command_text = command_text.strip()
|
||||
print("🧠 Sende Audio an OpenAI Whisper API...")
|
||||
|
||||
# Rohe Audio-Bytes zusammenfügen und als WAV speichern
|
||||
wav_path = "/tmp/jarvis_cmd.wav"
|
||||
all_bytes = b"".join(collected_chunks)
|
||||
|
||||
try:
|
||||
with wave.open(wav_path, "wb") as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2) # int16 entspricht 2 Bytes
|
||||
wf.setframerate(AUDIO_RATE)
|
||||
wf.writeframes(all_bytes)
|
||||
|
||||
# Whisper API aufrufen
|
||||
with open(wav_path, "rb") as audio_file:
|
||||
transcription = openai_client.audio.transcriptions.create(
|
||||
model="whisper-1",
|
||||
file=audio_file,
|
||||
language="de" # Erzwingt deutsche Texterkennung
|
||||
)
|
||||
command_text = transcription.text.strip()
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler bei der Spracherkennung: {e}")
|
||||
command_text = ""
|
||||
|
||||
# Phase 3: Befehl verarbeiten, falls Whisper etwas verstanden hat
|
||||
if command_text:
|
||||
print(f"🗣️ Erkannter Befehl: '{command_text}'")
|
||||
print(f"🗣️ Erkannt (Whisper): '{command_text}'")
|
||||
print("🧠 Übermittle an J.A.R.V.I.S. Gehirn...")
|
||||
|
||||
# Rufe jarvis.py im virtuellen Environment auf und übergib den Befehl
|
||||
# (Wir nutzen hier Google Gemini oder was auch immer in deiner .env aktiv ist!)
|
||||
subprocess.run([
|
||||
"venv/bin/python3",
|
||||
"jarvis.py",
|
||||
@@ -99,4 +135,3 @@ with sd.RawInputStream(samplerate=AUDIO_RATE, blocksize=8000, dtype='int16',
|
||||
|
||||
print("\n🎙️ Zurück im Standby. Lausche auf 'Jarvis'...")
|
||||
wake_recognizer.Reset()
|
||||
command_recognizer.Reset()
|
||||
|
||||
Reference in New Issue
Block a user