在AI技术快速发展的今天,大模型能力正从云端逐步走向本地。对于开发者而言,将大模型部署在本地不仅能避免API调用限制和隐私泄露风险,还能实现更灵活的定制化开发。本文将带你一步步实现一个基于Ollama和Flask的本地大模型问答服务,从环境配置到API设计,再到性能优化,提供一套完整的解决方案。
在开始之前,确保你的开发环境满足以下基本要求:
硬件配置:
软件环境:
Ollama是一个轻量级的本地大模型运行框架,支持多种量化模型格式。安装过程非常简单:
# Windows: download the installer and run it
curl -L https://ollama.ai/download/OllamaSetup.exe -o OllamaSetup.exe
./OllamaSetup.exe

# Linux: one-line install script
curl -fsSL https://ollama.ai/install.sh | sh
模型导入是本地部署的关键步骤。以WizardCoder-Python-13B模型为例:
import subprocess

# Create the Modelfile that points Ollama at the local GGUF weights
with open('Modelfile', 'w') as f:
    f.write('FROM ./wizardcoder-python-13b-v1.0.Q5_K_M.gguf')

# Register the model with Ollama. `ollama create` is a shell command,
# not Python — run it via subprocess (or directly in a terminal).
subprocess.run(['ollama', 'create', 'wizardcoder-13b', '-f', 'Modelfile'], check=True)
安装完成后,可以通过简单的Python代码测试模型是否正常运行:
import ollama

# Smoke test: send one prompt and print the model's reply
reply = ollama.chat(
    model='wizardcoder-13b',
    messages=[{'role': 'user', 'content': '用Python实现快速排序算法'}],
)
print(reply['message']['content'])
一个良好的项目结构能大大提高代码的可维护性。建议采用以下目录结构:
/local_llm_api/
│── app.py # 主应用入口
│── requirements.txt # 依赖列表
│── config.py # 配置文件
│── utils/ # 工具函数
│ └── model_utils.py
│── tests/ # 测试代码
│ └── test_api.py
Flask的路由设计需要考虑RESTful原则和实际使用场景。以下是基础的API设计:
from flask import Flask, request, jsonify
import ollama

app = Flask(__name__)


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completion():
    """Minimal chat endpoint: validate the payload and proxy it to Ollama."""
    data = request.get_json()
    # Parameter validation: 'messages' is the only required field
    if not data or 'messages' not in data:
        return jsonify({'error': 'Invalid request format'}), 400
    try:
        # Always call non-streaming here: with stream=True ollama.chat
        # returns a generator, which jsonify cannot serialize.
        # Streaming is handled by the SSE variant of this endpoint below.
        response = ollama.chat(
            model='wizardcoder-13b',
            messages=data['messages'],
            stream=False
        )
        return jsonify(response)
    except Exception as e:
        # Surface backend failures as a 500 with the error text
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
为了与主流AI API保持兼容,建议采用以下JSON格式:
请求示例:
{
"model": "wizardcoder-13b",
"messages": [
{"role": "system", "content": "你是一个有帮助的AI助手"},
{"role": "user", "content": "如何用Python读取CSV文件?"}
],
"temperature": 0.7,
"max_tokens": 1000,
"stream": false
}
响应示例:
{
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "可以使用Python的csv模块..."
},
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 56,
"completion_tokens": 31,
"total_tokens": 87
}
}
流式输出能显著提升用户体验,特别是在处理长文本生成时。Flask通过生成器函数实现流式响应:
import json

from flask import Response, stream_with_context


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completion():
    """Chat endpoint with Server-Sent Events streaming support."""
    data = request.get_json()
    stream = data.get('stream', False)
    if stream:
        def generate():
            response = ollama.chat(
                model='wizardcoder-13b',
                messages=data['messages'],
                stream=True
            )
            for chunk in response:
                # One SSE frame per chunk: "data: <json>\n\n"
                yield f"data: {json.dumps(chunk)}\n\n"
            # Sentinel so clients know the stream has finished
            yield "data: [DONE]\n\n"
        return Response(stream_with_context(generate()), mimetype='text/event-stream')
    else:
        # Non-streaming handling (same as the basic endpoint)
        ...
在实际应用中,可能需要同时支持多个模型。可以通过动态加载机制实现:
# Registry of models this service knows about; 'loaded' tracks lazy loading
models = {
    'wizardcoder-13b': {
        'path': './models/wizardcoder-13b',
        'loaded': False
    },
    'llama2-7b': {
        'path': './models/llama2-7b',
        'loaded': False
    }
}


@app.route('/v1/models', methods=['GET'])
def list_models():
    """Return the identifiers of all registered models."""
    return jsonify({'data': [{'id': k} for k in models.keys()]})


@app.route('/v1/models/<model_id>/load', methods=['POST'])
def load_model(model_id):
    """Load a registered model on demand; 404 for unknown ids."""
    if model_id not in models:
        return jsonify({'error': 'Model not found'}), 404
    try:
        # NOTE(review): ollama.pull normally takes a model *name*, not a
        # filesystem path — confirm this argument against the Ollama SDK
        ollama.pull(models[model_id]['path'])
        models[model_id]['loaded'] = True
        return jsonify({'status': 'success'})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
本地大模型服务性能是关键考量。以下是一些有效的优化手段:
内存管理:
import gc


@app.after_request
def clean_up(response):
    """Trigger a garbage-collection pass after every response is built."""
    gc.collect()
    return response
请求批处理:
@app.route('/v1/batch/chat', methods=['POST'])
def batch_chat():
    """Process several chat requests in one HTTP call.

    Expects a JSON array of request objects and returns one result
    (or error entry) per request, in order.
    """
    # Renamed from `requests` to avoid confusion with flask's `request`
    # and the third-party `requests` module
    batch = request.get_json()
    # Reject non-list payloads up front instead of failing mid-loop
    if not isinstance(batch, list):
        return jsonify({'error': 'Expected a JSON array of requests'}), 400
    results = []
    for req in batch:
        try:
            response = ollama.chat(
                model=req.get('model', 'wizardcoder-13b'),
                messages=req['messages']
            )
            results.append(response)
        except Exception as e:
            # A failed item doesn't abort the rest of the batch
            results.append({'error': str(e)})
    return jsonify({'data': results})
缓存机制:
from flask_caching import Cache

# Simple in-process cache (per worker); fine for a single-process deployment
cache = Cache(config={'CACHE_TYPE': 'SimpleCache'})
cache.init_app(app)


@cache.memoize(timeout=300)
def get_cached_response(prompt):
    """Return the model reply for *prompt*, memoized for 5 minutes."""
    return ollama.chat(
        model='wizardcoder-13b',
        messages=[{'role': 'user', 'content': prompt}]
    )
对外开放API服务必须考虑安全性:
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

# Global default: 100 requests per minute per client IP
limiter = Limiter(
    app=app,
    key_func=get_remote_address,
    default_limits=["100 per minute"]
)


@app.route('/v1/chat/completions', methods=['POST'])
@limiter.limit("10/minute")  # stricter per-endpoint limit
def chat_completion():
    ...
API密钥验证中间件:
import os
from functools import wraps


def require_api_key(f):
    """Decorator: reject requests whose X-API-KEY header doesn't match API_KEY.

    NOTE(review): if the API_KEY environment variable is unset, a request
    that omits the header compares None == None and is let through —
    always set API_KEY in production.
    """
    @wraps(f)
    def decorated(*args, **kwargs):
        api_key = request.headers.get('X-API-KEY')
        if api_key != os.getenv('API_KEY'):
            return jsonify({'error': 'Invalid API key'}), 403
        return f(*args, **kwargs)
    return decorated


@app.route('/v1/chat/completions', methods=['POST'])
@require_api_key
def chat_completion():
    ...
完善的监控系统能帮助及时发现和解决问题:
import logging
from prometheus_flask_exporter import PrometheusMetrics

# Write an access log to a local file
logging.basicConfig(
    filename='api.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s'
)

# Prometheus instrumentation for the Flask app
metrics = PrometheusMetrics(app)
metrics.info('app_info', 'Local LLM API Info', version='1.0')


@app.after_request
def log_response(response):
    """Emit one access-log line per handled request."""
    app.logger.info(
        f"{request.remote_addr} {request.method} {request.path} "
        f"{response.status_code} {response.content_length}bytes"
    )
    return response
使用Docker可以简化部署流程并提高可移植性:
# Dockerfile
FROM python:3.9-slim

WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt

ENV FLASK_APP=app.py
# NOTE(review): FLASK_ENV is deprecated in Flask 2.3+ (use FLASK_DEBUG);
# harmless here since gunicorn, not `flask run`, serves the app
ENV FLASK_ENV=production

EXPOSE 5000
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "app:app"]
对应的docker-compose.yml配置:
version: '3'
services:
  llm-api:
    build: .
    ports:
      - "5000:5000"
    environment:
      - API_KEY=${API_KEY}
    # NOTE(review): 'deploy.resources' is honored by Docker Swarm and newer
    # Compose versions only — verify against your compose runtime
    deploy:
      resources:
        limits:
          memory: 16G
    volumes:
      # Mount local model files into the container
      - ./models:/app/models
以下是一个可直接使用的完整Flask应用示例:
# app.py
import json
import logging
import os
import time
import uuid
from functools import wraps

from flask import Flask, request, jsonify, Response, stream_with_context
import ollama
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from prometheus_flask_exporter import PrometheusMetrics

app = Flask(__name__)
limiter = Limiter(app=app, key_func=get_remote_address)
metrics = PrometheusMetrics(app)

# Configuration (overridable via environment variables)
MODEL_NAME = os.getenv('MODEL_NAME', 'wizardcoder-13b')
API_KEY = os.getenv('API_KEY')

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s'
)


def require_api_key(f):
    """Reject requests whose X-API-KEY header doesn't match API_KEY.

    When API_KEY is unset, authentication is skipped entirely.
    """
    @wraps(f)
    def decorated(*args, **kwargs):
        if API_KEY and request.headers.get('X-API-KEY') != API_KEY:
            return jsonify({'error': 'Invalid API key'}), 403
        return f(*args, **kwargs)
    return decorated


@app.route('/v1/models', methods=['GET'])
@require_api_key
def list_models():
    """Return the single configured model in OpenAI-compatible format."""
    return jsonify({
        'data': [{
            'id': MODEL_NAME,
            'object': 'model',
            'created': 1686935002,
            'owned_by': 'user'
        }]
    })


@app.route('/v1/chat/completions', methods=['POST'])
@require_api_key
@limiter.limit("60/minute")
def chat_completion():
    """OpenAI-compatible chat endpoint with optional SSE streaming."""
    data = request.get_json()
    if not data or 'messages' not in data:
        return jsonify({'error': 'messages field is required'}), 400
    stream = data.get('stream', False)
    if stream:
        def generate():
            response = ollama.chat(
                model=MODEL_NAME,
                messages=data['messages'],
                stream=True
            )
            for chunk in response:
                # Build the payload as a dict first: a multi-line dict
                # literal nested inside a single-quoted f-string is a
                # SyntaxError before Python 3.12
                payload = {
                    'id': 'chatcmpl-' + str(uuid.uuid4()),
                    'object': 'chat.completion.chunk',
                    'created': int(time.time()),
                    'model': MODEL_NAME,
                    'choices': [{
                        'index': 0,
                        'delta': {'content': chunk['message']['content']},
                        'finish_reason': None
                    }]
                }
                yield f"data: {json.dumps(payload)}\n\n"
            yield "data: [DONE]\n\n"
        return Response(stream_with_context(generate()), mimetype='text/event-stream')
    else:
        try:
            response = ollama.chat(
                model=MODEL_NAME,
                messages=data['messages']
            )
            # Character counts of the last user message / reply stand in
            # for real token counts here — an approximation, not tokens
            prompt_chars = len(data['messages'][-1]['content'])
            completion_chars = len(response['message']['content'])
            return jsonify({
                'id': 'chatcmpl-' + str(uuid.uuid4()),
                'object': 'chat.completion',
                'created': int(time.time()),
                'model': MODEL_NAME,
                'choices': [{
                    'index': 0,
                    'message': response['message'],
                    'finish_reason': 'stop'
                }],
                'usage': {
                    'prompt_tokens': prompt_chars,
                    'completion_tokens': completion_chars,
                    'total_tokens': prompt_chars + completion_chars
                }
            })
        except Exception as e:
            app.logger.error(f"Error in chat completion: {str(e)}")
            return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
配套的requirements.txt文件:
flask==2.3.2
ollama==0.1.2
flask-limiter==3.3.0
flask-caching==2.1.0
prometheus-flask-exporter==0.22.4
gunicorn==20.1.0