In the wave of digital transformation, automated data collection and processing has become a core capability for companies looking to cut costs and improve efficiency. 影刀RPA (Yingdao RPA), one of the leading domestic process-automation tools, tests exactly this in its advanced certification exam: the questions on web scraping and database integration are a comprehensive check of a developer's hands-on skills. This article dissects a typical scenario end to end: scraping structured data from a box-office website protected by anti-scraping measures and storing it safely in a MySQL database.
Before getting started, make sure the development environment satisfies the following requirements:
```bash
pip install pymysql lxml fake-useragent requests-html
```
Modern websites commonly deploy several anti-scraping strategies, so the solution prepares the following defenses up front:
```python
from fake_useragent import UserAgent
import random
import time

# Dynamic User-Agent generator
ua = UserAgent()

def get_random_headers():
    return {
        'User-Agent': ua.random,
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.endata.com.cn/'
    }

# Randomized delay between requests
def random_delay(base=1, variance=3):
    time.sleep(base + random.random() * variance)
```
Tip: in real projects, set the base delay to 2-3 seconds and keep the variance between 1 and 5 seconds; this maintains throughput while staying below typical rate limits.
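With the recommended values, a call between requests would look like this (a minimal usage sketch of the `random_delay` helper above):

```python
# Sleep roughly 2-7 seconds: a 2s base plus up to 5s of random jitter
random_delay(base=2, variance=5)
```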
Box-office websites typically rely on dynamic rendering and complex DOM structures, so precise XPath targeting is key. The following extraction scheme has been validated in practice:
```python
from lxml import etree

def extract_movie_data(html):
    """Use relative XPath expressions so minor structure changes don't break extraction."""
    tree = etree.HTML(html)
    data = {
        'name': tree.xpath('normalize-space(//h1[@class="movie-title"]/text())'),
        'year': tree.xpath('substring-before(//span[contains(text(),"上映")]/text(),"年")'),
        'area': tree.xpath('string(//dt[text()="制片地区"]/following-sibling::dd[1])'),
        'poster': tree.xpath('//div[@class="poster"]/img/@src')[0],
        'directors': parse_directors(tree),
        'box_office': parse_box_office(tree)
    }
    return {k: v.strip() if isinstance(v, str) else v for k, v in data.items()}
```
Key improvements:
- `normalize-space()` strips stray whitespace from the extracted text
- `substring-before` pulls out just the numeric year

When a film lists multiple directors, the separators and whitespace need special handling:
```python
def parse_directors(tree):
    directors = tree.xpath('''
        //dt[text()="导演"]/following-sibling::dd[
            not(preceding-sibling::dt[text()="演员"])
        ]/a/text()
    ''')
    return ', '.join([d.strip() for d in directors if d.strip()])
```
Note: avoid naive string concatenation; clean the values and check that they are valid first.
Box-office figures come in different units ("亿", hundreds of millions, and "万", tens of thousands) and need to be normalized to a single numeric scale (here, units of 万, i.e. 10,000):
```python
def parse_box_office(tree):
    raw_text = tree.xpath('string(//dt[text()="票房"]/following-sibling::dd[1])')
    if not raw_text:
        return 0.0
    num_text = raw_text.replace(',', '').strip()
    try:
        if '亿' in num_text:      # hundreds of millions -> units of 10,000
            return float(num_text.replace('亿', '')) * 10000
        elif '万' in num_text:    # already in units of 10,000
            return float(num_text.replace('万', ''))
        else:
            return float(num_text)
    except ValueError:            # non-numeric text such as "N/A"
        return 0.0
```
Common cases and how they are handled:
| Raw value | Handling | Result |
|---|---|---|
| "1.25亿" | multiply by 10,000 | 12500.0 |
| "3,456万" | strip the comma, convert | 3456.0 |
| "1234" | convert directly | 1234.0 |
| "N/A" | return 0 | 0.0 |
Build a complete data-validation layer:
```python
def validate_movie_data(data):
    """Check required fields and fill in defaults."""
    required_fields = ['name', 'year', 'box_office']
    for field in required_fields:
        # Reject missing or empty values; a 0.0 box office (e.g. "N/A") is still allowed
        if data.get(field) in (None, ''):
            raise ValueError(f"Missing required field: {field}")
    # Defaults for optional fields
    data.setdefault('area', '未知地区')
    data.setdefault('directors', '未知导演')
    # Type coercion
    try:
        data['year'] = int(data['year'])
    except (ValueError, TypeError):
        data['year'] = 0
    return data
```
Use a connection pool to make database operations more efficient:
```python
import pymysql
from dbutils.pooled_db import PooledDB

class MovieDB:
    def __init__(self, config):
        self.pool = PooledDB(
            creator=pymysql,
            maxconnections=5,
            **config
        )

    def save_movie(self, data):
        conn = self.pool.connection()
        try:
            with conn.cursor() as cursor:
                sql = """INSERT INTO movies (
                    电影名称, 上映年份, 制片地区,
                    海报链接, 导演, 票房, 提交人
                ) VALUES (%s, %s, %s, %s, %s, %s, %s)"""
                cursor.execute(sql, (
                    data['name'], data['year'], data['area'],
                    data['poster'], data['directors'],
                    data['box_office'], data['submitter']
                ))
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()
```
Recommended connection-pool settings:
| Parameter | Recommended value | Notes |
|---|---|---|
| maxconnections | 5-10 | tune to server load |
| mincached | 2 | connections opened at startup |
| blocking | True | wait when the pool is exhausted |
| ping | 1 | check connection liveness automatically |
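Put together, a pool built with these settings might look like the sketch below; the host, user, password, and database values are placeholders, and all connection keyword arguments are passed straight through to `pymysql.connect`:

```python
import pymysql
from dbutils.pooled_db import PooledDB

pool = PooledDB(
    creator=pymysql,       # driver used to open raw connections
    maxconnections=10,     # hard upper bound on simultaneous connections
    mincached=2,           # connections opened eagerly at startup
    blocking=True,         # wait for a free connection instead of raising
    ping=1,                # verify liveness before handing a connection out
    host='127.0.0.1',      # placeholder credentials -- replace with real values
    user='spider',
    password='secret',
    database='movies_db',
    charset='utf8mb4'
)
```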
When large volumes of data need to be written, switch to batch inserts:
```python
def batch_insert_movies(db, movies):
    sql = """INSERT INTO movies (
        电影名称, 上映年份, 制片地区,
        海报链接, 导演, 票房, 提交人
    ) VALUES (%s, %s, %s, %s, %s, %s, %s)"""
    values = [(
        m['name'], m['year'], m['area'],
        m['poster'], m['directors'],
        m['box_office'], m['submitter']
    ) for m in movies]
    conn = db.pool.connection()
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, values)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
```
Note: keep each batch to roughly 100-500 records so the generated SQL statement does not grow too long.
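`batch_insert_movies` above sends everything in a single call; to honor the 100-500 record guideline, the list can be sliced into chunks first. A minimal sketch built on top of that function, with the chunk size chosen from the recommended range:

```python
def batch_insert_in_chunks(db, movies, chunk_size=200):
    # Insert the records in slices of at most chunk_size rows each
    for start in range(0, len(movies), chunk_size):
        batch_insert_movies(db, movies[start:start + chunk_size])
```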
Wrap the core functionality into a reusable Python module:
```python
import xbot
from xbot import print as xprint
from lxml import etree

class MovieSpider:
    def __init__(self, db_config):
        self.db = MovieDB(db_config)
        self.session = xbot.web.Session()

    def crawl_page(self, url):
        try:
            xprint(f"Processing page: {url}")
            html = self.session.get(url, headers=get_random_headers()).text
            data = extract_movie_data(html)
            data = validate_movie_data(data)
            random_delay()
            return data
        except Exception as e:
            xprint(f"Failed to process page: {url} - {str(e)}")
            return None

    def run(self, start_url, submitter):
        # Parse the listing page with lxml, the same approach used in extract_movie_data
        html = self.session.get(start_url, headers=get_random_headers()).text
        movie_links = etree.HTML(html).xpath('//tbody/tr/td/a/@href')
        success_count = 0
        for link in movie_links:
            full_url = f"https://www.endata.com.cn{link}"
            movie_data = self.crawl_page(full_url)
            if movie_data:
                movie_data['submitter'] = submitter
                self.db.save_movie(movie_data)
                success_count += 1
        xprint(f"Done: {success_count}/{len(movie_links)} records processed")
```
Build out a thorough error-handling mechanism:
```python
import logging
from logging.handlers import RotatingFileHandler

def setup_logger():
    logger = logging.getLogger('movie_spider')
    logger.setLevel(logging.INFO)
    handler = RotatingFileHandler(
        'movie_spider.log',
        maxBytes=10 * 1024 * 1024,
        backupCount=5
    )
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger

# Integrate the logger into the spider class
class MovieSpider:
    def __init__(self, db_config):
        self.logger = setup_logger()
        # ...rest of the initialization...

    def crawl_page(self, url):
        try:
            self.logger.info(f"Processing page: {url}")
            # ...crawl logic...
        except Exception as e:
            self.logger.error(f"Failed to process {url}: {str(e)}",
                              exc_info=True)
            return None
```
Logging best practices:
- Log routine progress at INFO and failures at ERROR with `exc_info=True` so the full traceback is preserved.
- Use `RotatingFileHandler` with a size cap (`maxBytes`) and `backupCount` to keep disk usage bounded.
- Include the URL or record identifier in every message so failed items can be located and replayed.
When large-scale collection is required, the following distributed extension is worth considering:
```python
import redis
from rq import Queue
from lxml import etree

class DistributedSpider:
    def __init__(self, redis_url, db_config):
        self.redis_conn = redis.from_url(redis_url)
        self.task_queue = Queue(connection=self.redis_conn)
        self.db_config = db_config

    def enqueue_task(self, url, submitter):
        self.task_queue.enqueue(
            'spider.tasks.process_movie_page',
            url=url,
            submitter=submitter,
            db_config=self.db_config
        )

    def dispatch_tasks(self, start_url, submitter):
        session = xbot.web.Session()
        html = session.get(start_url, headers=get_random_headers()).text
        movie_links = etree.HTML(html).xpath('//tbody/tr/td/a/@href')
        for link in movie_links:
            full_url = f"https://www.endata.com.cn{link}"
            self.enqueue_task(full_url, submitter)
```
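RQ resolves the dotted path `'spider.tasks.process_movie_page'` on the worker side, so a matching function has to exist in that module. The original does not show it; the following is a sketch of what `spider/tasks.py` might contain, reusing the helpers defined earlier (the module layout and the use of `requests` for fetching are assumptions):

```python
# spider/tasks.py -- executed inside an RQ worker process
# (imports of MovieDB, get_random_headers, extract_movie_data,
#  validate_movie_data and random_delay from the modules above are omitted)
import requests

def process_movie_page(url, submitter, db_config):
    """Fetch one detail page, extract and validate its data, then persist it."""
    db = MovieDB(db_config)
    html = requests.get(url, headers=get_random_headers(), timeout=10).text
    data = validate_movie_data(extract_movie_data(html))
    data['submitter'] = submitter
    db.save_movie(data)
    random_delay()
```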
Comparison of distributed components:
| Component | Typical use case | Pros | Cons |
|---|---|---|---|
| Redis RQ | small to mid-sized workloads | simple to use | fairly basic feature set |
| Celery | complex workflows | very powerful | complex to configure |
| Apache Airflow | scheduled pipelines | strong visualization | resource-heavy |
For data sources that need periodic refreshes, implement incremental collection:
```python
import hashlib

def get_existing_movies(db):
    """Return the set of content hashes for movies already stored."""
    conn = db.pool.connection()
    try:
        with conn.cursor() as cursor:
            cursor.execute("SELECT MD5(CONCAT(电影名称, 上映年份)) FROM movies")
            return {row[0] for row in cursor.fetchall()}
    finally:
        conn.close()

def is_new_movie(db, movie_data):
    """Check whether the record has not been stored yet."""
    key = hashlib.md5(
        f"{movie_data['name']}{movie_data['year']}".encode('utf-8')
    ).hexdigest()
    return key not in get_existing_movies(db)
```
This content-hash comparison is more reliable than relying on IDs or timestamps alone.
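Wired into the loop of `MovieSpider.run`, the save step could then be guarded as in the sketch below (`self`, `movie_links`, `submitter`, and `success_count` refer to the names used in that method):

```python
for link in movie_links:
    movie_data = self.crawl_page(f"https://www.endata.com.cn{link}")
    if movie_data and is_new_movie(self.db, movie_data):  # skip records already stored
        movie_data['submitter'] = submitter
        self.db.save_movie(movie_data)
        success_count += 1
```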