"""Flask voice-assistant service: SenseVoice ASR -> Qwen2.5 chat LLM -> edge-tts TTS."""

from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
import time
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from funasr import AutoModel
import edge_tts
import asyncio
import langid
import tempfile

app = Flask(__name__)
CORS(app)

# Configuration
AUDIO_RATE = 16000
OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize models (loaded once at startup; these are heavy local checkpoints)
print("Loading models...")
model_dir = "D:/AI/download/SenseVoiceSmall"
model_senceVoice = AutoModel(model=model_dir, trust_remote_code=True)

# Load the Qwen2.5 large language model
model_name = "D:/AI/download/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("All models loaded!")

# Detected language code -> edge-tts voice name
language_speaker = {
    "ja": "ja-JP-NanamiNeural",
    "fr": "fr-FR-DeniseNeural",
    "es": "ca-ES-JoanaNeural",
    "de": "de-DE-KatjaNeural",
    "zh": "zh-CN-XiaoyiNeural",
    "en": "en-US-AnaNeural",
}

# Persona system prompt shared by every chat request (kept byte-identical).
SYSTEM_PROMPT = "你叫千问,是一个18岁的女大学生,性格活泼开朗,说话俏皮"


def _run_asr(audio_path):
    """Run SenseVoice ASR on *audio_path* and return the recognized text."""
    res = model_senceVoice.generate(
        input=audio_path,
        cache={},
        language="auto",
        use_itn=False,
    )
    # SenseVoice prefixes its output with <|...|> tags; keep only the final segment.
    return res[0]['text'].split(">")[-1]


def _llm_answer(prompt):
    """Generate a chat answer for *prompt* with the Qwen2.5 model."""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    # Strip the prompt tokens so only the newly generated answer remains.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _synthesize(answer_text):
    """Synthesize *answer_text* to an MP3 in OUTPUT_DIR and return its path."""
    # Pick a TTS voice matching the detected language; fall back to Chinese.
    lang, _ = langid.classify(answer_text)
    speaker = language_speaker.get(lang, "zh-CN-XiaoyiNeural")
    output_file = os.path.join(OUTPUT_DIR, f"response_{int(time.time())}.mp3")
    asyncio.run(edge_tts.Communicate(answer_text, speaker).save(output_file))
    return output_file


def process_audio(input_path):
    """Full pipeline: ASR -> LLM -> TTS. Returns the path of the synthesized MP3."""
    prompt = _run_asr(input_path)
    print("ASR OUT:", prompt)
    output_text = _llm_answer(prompt)
    print("Answer:", output_text)
    return _synthesize(output_text)


# ASR-only endpoint
@app.route('/asr', methods=['POST'])
def handle_asr():
    """Accept an uploaded 'audio' file and return its transcript as JSON."""
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file provided"}), 400
    tmp_path = None
    try:
        audio_file = request.files['audio']
        # delete=False so the closed file can be re-opened by the ASR model
        # (required on Windows); we clean it up ourselves in the finally block —
        # the original version leaked one temp file per request.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp_path = tmp.name
            audio_file.save(tmp_path)
        asr_text = _run_asr(tmp_path)
        return jsonify({
            "asr_text": asr_text
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)


# LLM + TTS endpoint
@app.route('/generate', methods=['POST'])
def handle_generate():
    """Take JSON {'asr_text': ...}; return the chat answer and a URL for its TTS audio."""
    try:
        data = request.get_json()
        asr_text = data.get('asr_text', '')
        if not asr_text:
            return jsonify({"error": "No ASR text provided"}), 400
        answer_text = _llm_answer(asr_text)
        output_file = _synthesize(answer_text)
        return jsonify({
            "answer_text": answer_text,
            "audio_url": f"/audio/{os.path.basename(output_file)}"
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/audio/<filename>')
def get_audio(filename):
    """Serve a previously synthesized MP3 from OUTPUT_DIR.

    BUG FIX: the original rule was '/audio/' with no <filename> placeholder,
    so Flask never supplied the view's argument and every audio_url returned
    by /generate was unreachable.
    """
    return send_from_directory(OUTPUT_DIR, filename)


if __name__ == '__main__':
    app.run(port=5000, threaded=True)