在量化金融领域,期权数据是构建交易策略的重要基础。传统的手动下载方式不仅效率低下,还容易出错。本文将带你用Python构建一个完整的期权数据自动化管道,涵盖数据获取、清洗、存储全流程。
构建自动化数据管道的第一步是选择合适的工具链。我们需要的核心组件包括:
安装所需库:
pip install akshare pandas requests sqlalchemy
对于需要处理三大交易所(上交所、深交所、中金所)的数据,每个交易所的API特点不同:
| 交易所 | 数据格式 | 认证要求 | 更新频率 |
|---|---|---|---|
| 上交所 | CSV | 需要Referer | 交易日收盘后1小时 |
| 深交所 | Excel | 无 | 交易日收盘后30分钟 |
| 中金所 | XML | 无 | 交易日收盘后45分钟 |
提示:上交所API需要设置HTTP头部的Referer字段,否则会返回403错误
可靠的交易日历是自动化采集的基础。Akshare提供了多种交易日历接口:
# Fetch the Sina Finance trading calendar through akshare.
import akshare as ak

calendar = ak.tool_trade_date_hist_sina()
print(calendar.head())
处理交易日历时需要注意几个关键点:
优化后的日期处理代码:
def format_date_for_exchange(date_str, exchange):
    """Format a date according to each exchange's API convention.

    SSE expects ``YYYYMMDD``, SZSE expects ``YYYY-MM-DD``, and CFFEX needs
    the year-month and day split into separate URL path components.

    Raises ValueError for an unrecognized exchange code.
    """
    dt = pd.to_datetime(date_str)
    formatters = {
        'SSE': lambda d: d.strftime('%Y%m%d'),        # Shanghai Stock Exchange
        'SZSE': lambda d: d.strftime('%Y-%m-%d'),     # Shenzhen Stock Exchange
        'CFFEX': lambda d: {'YM': d.strftime('%Y%m'), 'D': d.strftime('%d')},
    }
    try:
        return formatters[exchange](dt)
    except KeyError:
        raise ValueError(f"未知交易所: {exchange}") from None
上交所提供CSV格式的期权风险指标数据,需要特别注意HTTP头设置:
def fetch_sse_data(date_str):
    """Download SSE option risk-indicator data (CSV) for one trading day.

    Returns a DataFrame with an added ``trade_date`` column, or None on any
    failure (best-effort semantics kept so the batch pipeline can continue).
    """
    headers = {
        # SSE rejects requests without a Referer header (HTTP 403).
        'Referer': 'http://www.sse.com.cn/',
        'User-Agent': 'Mozilla/5.0'
    }
    formatted_date = format_date_for_exchange(date_str, 'SSE')
    url = f'http://query.sse.com.cn/derivative/downloadRisk.do?trade_date={formatted_date}&productType=0'
    try:
        # timeout keeps the pipeline from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=30)
        # Surface HTTP errors instead of trying to parse an error page as CSV.
        response.raise_for_status()
        response.encoding = 'gbk'  # SSE serves the CSV in GBK encoding
        df = pd.read_csv(StringIO(response.text))
        df['trade_date'] = date_str  # tag rows with the collection date
        return df
    except Exception as e:
        print(f"获取上交所数据失败({date_str}): {str(e)}")
        return None
深交所数据以Excel格式提供,处理时需要注意:
def fetch_szse_data(date_str):
    """Download SZSE option data (xlsx) for one trading day.

    Returns a DataFrame with an added ``trade_date`` column, or None on any
    failure (best-effort semantics kept so the batch pipeline can continue).
    """
    formatted_date = format_date_for_exchange(date_str, 'SZSE')
    url = f'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=option_hyfxzb&TABKEY=tab1&txtSearchDate={formatted_date}'
    try:
        # timeout keeps the pipeline from hanging forever on a stalled connection
        response = requests.get(url, timeout=30)
        # Surface HTTP errors instead of handing an error page to openpyxl.
        response.raise_for_status()
        # BytesIO avoids writing a temporary file to disk.
        df = pd.read_excel(BytesIO(response.content), engine='openpyxl')
        df['trade_date'] = date_str  # tag rows with the collection date
        return df
    except Exception as e:
        print(f"获取深交所数据失败({date_str}): {str(e)}")
        return None
中金所使用XML格式,需要用XPath解析:
def fetch_cffex_data(date_str):
    """Download CFFEX daily option data (XML) for one trading day.

    Each ``<dailydata>`` element becomes one row; child tag names become
    columns. Returns a DataFrame with an added ``trade_date`` column, or
    None on any failure (best-effort semantics for the batch pipeline).
    """
    date_parts = format_date_for_exchange(date_str, 'CFFEX')
    url = f'http://www.cffex.com.cn/sj/hqsj/rtj/{date_parts["YM"]}/{date_parts["D"]}/index.xml?id=39'
    try:
        # timeout keeps the pipeline from hanging forever on a stalled connection
        response = requests.get(url, timeout=30)
        # Surface HTTP errors instead of feeding an error page to the XML parser.
        response.raise_for_status()
        root = ET.fromstring(response.content)
        records = []
        for daily_data in root.findall('dailydata'):
            record = {child.tag: child.text for child in daily_data}
            record['trade_date'] = date_str  # tag rows with the collection date
            records.append(record)
        return pd.DataFrame(records)
    except Exception as e:
        print(f"获取中金所数据失败({date_str}): {str(e)}")
        return None
不同交易所的数据字段差异很大,需要统一标准化:
# Unified column name -> per-exchange source column.
# BUG FIX: the original built source names as f'{col}价' / f'{col}价格' /
# f'{col}price', yielding non-existent columns such as 'open价' for SSE and
# 'highprice' for CFFEX (the validation rules in this file require '开盘价',
# '最高价格', 'highestprice', ...), so the lookups raised KeyError.
# NOTE(review): the full Chinese/CFFEX column sets are inferred from
# validate_data's required columns — confirm against live exchange files.
_PRICE_COLUMNS = {
    'SSE': {'open': '开盘价', 'high': '最高价', 'low': '最低价',
            'close': '收盘价', 'settlement': '结算价'},
    'SZSE': {'open': '开盘价格', 'high': '最高价格', 'low': '最低价格',
             'close': '收盘价格', 'settlement': '结算价格'},
    'CFFEX': {'open': 'openprice', 'high': 'highestprice', 'low': 'lowestprice',
              'close': 'closeprice', 'settlement': 'settlementprice'},
}

# (contract-code column, underlying column) per exchange.
_CODE_COLUMNS = {
    'SSE': ('合约编码', '标的证券代码'),
    'SZSE': ('合约代码', '标的证券代码'),
    'CFFEX': ('instrumentid', 'productid'),
}


def standardize_data(df, exchange):
    """Normalize one exchange's raw frame into the unified schema.

    Args:
        df: raw frame from the corresponding fetch_* function; must contain
            a ``trade_date`` column plus that exchange's native columns.
        exchange: one of 'SSE', 'SZSE', 'CFFEX'.

    Returns:
        A new DataFrame with columns trade_date, exchange, contract_code,
        underlying, open, high, low, close, settlement.

    Raises:
        ValueError: for an unrecognized exchange (consistent with
        format_date_for_exchange).
    """
    if exchange not in _CODE_COLUMNS:
        raise ValueError(f"未知交易所: {exchange}")
    standardized = pd.DataFrame()
    # Common fields
    standardized['trade_date'] = df['trade_date']
    standardized['exchange'] = exchange
    # Contract / underlying identifiers
    contract_col, underlying_col = _CODE_COLUMNS[exchange]
    standardized['contract_code'] = df[contract_col]
    standardized['underlying'] = df[underlying_col]
    # Price fields
    for target, source in _PRICE_COLUMNS[exchange].items():
        standardized[target] = df[source]
    return standardized
使用SQLAlchemy创建统一的数据存储结构:
from sqlalchemy import create_engine, Column, String, Date, Float
# declarative_base moved to sqlalchemy.orm in SQLAlchemy 1.4; the old
# sqlalchemy.ext.declarative location is deprecated (requires SQLAlchemy>=1.4).
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class OptionData(Base):
    """One option contract's daily record in the unified cross-exchange schema."""
    __tablename__ = 'option_data'

    # Synthetic primary key: exchange + contract_code + trade_date is unique
    # per contract per day, so merge() can upsert on it.
    id = Column(String(50), primary_key=True)
    trade_date = Column(Date)
    exchange = Column(String(10))
    contract_code = Column(String(20))
    underlying = Column(String(10))
    open = Column(Float)
    high = Column(Float)
    low = Column(Float)
    close = Column(Float)
    settlement = Column(Float)
    volume = Column(Float)
    open_interest = Column(Float)

    def __init__(self, **kwargs):
        # Build the primary key first; the loop below then applies every
        # supplied attribute (an explicit 'id' in kwargs would override it).
        self.id = f"{kwargs['exchange']}_{kwargs['contract_code']}_{kwargs['trade_date']}"
        for key, value in kwargs.items():
            setattr(self, key, value)


# SQLite file in the working directory; create_all() is idempotent, so
# re-running the script does not recreate existing tables.
engine = create_engine('sqlite:///option_data.db')
Base.metadata.create_all(engine)
数据存储函数:
from sqlalchemy.orm import sessionmaker


def save_to_db(df, exchange):
    """Persist a standardized frame, upserting one OptionData row per record.

    Failures roll back the whole batch and are reported without raising,
    so one bad day does not abort the pipeline.
    """
    session = sessionmaker(bind=engine)()
    try:
        for _, row in df.iterrows():
            record = row.to_dict()
            record['exchange'] = exchange
            # merge() acts as an upsert keyed on the synthetic primary key
            session.merge(OptionData(**record))
        session.commit()
    except Exception as e:
        session.rollback()
        print(f"保存{exchange}数据失败: {str(e)}")
    finally:
        session.close()
将各组件组合成完整的工作流:
def run_pipeline(start_date, end_date):
    """Collect, standardize and persist option data for every trading day
    in [start_date, end_date] (inclusive) across all three exchanges.

    Args:
        start_date, end_date: 'YYYY-MM-DD' strings bounding the window.
    """
    trade_dates = ak.tool_trade_date_hist_sina()
    # BUG FIX: akshare returns datetime.date objects in this column, while
    # callers pass 'YYYY-MM-DD' strings; comparing the two directly raises
    # TypeError in recent pandas. Normalize both sides to timestamps.
    calendar = pd.to_datetime(trade_dates['trade_date'])
    mask = (calendar >= pd.to_datetime(start_date)) & (calendar <= pd.to_datetime(end_date))
    # One (label, fetcher) pair per exchange removes the copy-pasted blocks.
    fetchers = [
        ('SSE', fetch_sse_data),
        ('SZSE', fetch_szse_data),
        ('CFFEX', fetch_cffex_data),
    ]
    for trade_date in calendar[mask]:
        date_str = trade_date.strftime('%Y-%m-%d')
        print(f"处理日期: {date_str}")
        for exchange, fetch in fetchers:
            raw = fetch(date_str)
            if raw is not None:
                save_to_db(standardize_data(raw, exchange), exchange)
        time.sleep(1)  # polite delay so we do not hammer the exchange servers


if __name__ == '__main__':
    run_pipeline('2023-01-01', '2023-12-31')
健壮的数据管道需要完善的错误处理机制:
import logging
from datetime import datetime

# Configure file logging: one log file per calendar day, named with the
# current date, so each run appends to that day's file.
logging.basicConfig(
    filename=f'option_pipeline_{datetime.now().strftime("%Y%m%d")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
def log_error(exchange, date_str, error):
    """Report a collection failure to both the log file and stdout."""
    message = f"{exchange}数据采集失败({date_str}): {str(error)}"
    logging.error(message)
    print(message)
# 在fetch函数中添加错误日志记录
def fetch_sse_data(date_str):
try:
# ...原有代码...
except Exception as e:
log_error('SSE', date_str, e)
return None
随着数据量增加,需要考虑性能优化:
增量更新实现示例:
def get_last_success_date():
    """Return the most recent trade_date already stored, or None if the
    table is empty. Used to resume incremental collection from the day
    after the last successful run.
    """
    # FIX: passing a raw SQL string to Connection.execute() was removed in
    # SQLAlchemy 2.0 — wrap the statement in text() for forward compatibility.
    from sqlalchemy import text
    with engine.connect() as conn:
        last_date = conn.execute(text("SELECT MAX(trade_date) FROM option_data")).scalar()
    return pd.to_datetime(last_date) if last_date else None
def run_pipeline(start_date=None, end_date=None):
    """Incremental entry point.

    Defaults the collection window to (last stored day + 1) .. today when
    no explicit dates are given; falls back to 2019-01-01 on an empty DB.
    """
    if start_date is None:
        last_date = get_last_success_date()
        if last_date:
            start_date = (last_date + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            start_date = '2019-01-01'
    end_date = end_date or datetime.now().strftime('%Y-%m-%d')
    # ...rest of the pipeline unchanged...
自动化管道需要内置数据质量检查:
def validate_data(df, exchange):
    """Run basic quality checks on one exchange's raw frame.

    Raises ValueError when the frame is empty, a required column is
    missing, or any price-like column is more than 10% null; returns
    True otherwise.
    """
    if df.empty:
        raise ValueError(f"{exchange}数据为空")
    # Required source columns per exchange.
    required_columns = {
        'SSE': ['合约编码', '标的证券代码', '开盘价', '最高价'],
        'SZSE': ['合约代码', '标的证券代码', '开盘价格', '最高价格'],
        'CFFEX': ['instrumentid', 'productid', 'openprice', 'highestprice']
    }
    missing_cols = [c for c in required_columns[exchange] if c not in df.columns]
    if missing_cols:
        raise ValueError(f"{exchange}数据缺少必要列: {missing_cols}")
    # Sanity-check every column whose name looks like a price field,
    # in either naming language.
    for col in df.columns:
        if 'price' not in col.lower() and '价' not in col:
            continue
        if df[col].isnull().mean() > 0.1:  # more than 10% missing
            raise ValueError(f"{exchange}数据中{col}列缺失值过多")
    return True
将数据管道部署为定时任务:
Dockerfile示例:
# Minimal runtime image for the option-data pipeline.
# (FIX: removed the fused "dockerfile复制" copy-button artifact that made
# the first instruction invalid.)
FROM python:3.9-slim
WORKDIR /app
# Install dependencies before copying the source so Docker's layer cache
# survives code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["python", "pipeline.py"]
使用cron设置每日定时任务:
# 每天下午6点运行
0 18 * * * docker run --rm my-option-pipeline
采集的数据可以用于多种分析:
def analyze_option_data(start_date, end_date):
    """Aggregate daily contract counts, average close and total volume per
    exchange over [start_date, end_date], then plot the volume trend.

    SECURITY FIX: the dates are passed as bound parameters instead of being
    interpolated into the SQL string with an f-string, which avoids SQL
    injection and quoting bugs.
    """
    from sqlalchemy import text
    query = text("""
        SELECT trade_date, exchange, COUNT(*) as contract_count,
               AVG(close) as avg_close, SUM(volume) as total_volume
        FROM option_data
        WHERE trade_date BETWEEN :start_date AND :end_date
        GROUP BY trade_date, exchange
        ORDER BY trade_date
    """)
    with engine.connect() as conn:
        df = pd.read_sql(query, conn,
                         params={'start_date': start_date, 'end_date': end_date})
    # Simple visualization: one volume line per exchange.
    import matplotlib.pyplot as plt
    plt.figure(figsize=(12, 6))
    for exchange in df['exchange'].unique():
        subset = df[df['exchange'] == exchange]
        plt.plot(subset['trade_date'], subset['total_volume'], label=exchange)
    plt.title('各交易所期权成交量趋势')
    plt.xlabel('日期')
    plt.ylabel('成交量')
    plt.legend()
    plt.grid()
    plt.show()
在实际项目中,这套系统已经稳定运行了一年多,每天自动采集三大交易所的期权数据。最大的收获是建立了可靠的历史数据库,为后续的策略回测提供了坚实基础。几点经验分享: