作为一名长期从事数据采集工作的开发者,我经常遇到需要从各类网站抓取结构化数据的场景。Python凭借其丰富的库生态和简洁语法,成为网络爬虫开发的首选工具。今天我将通过一个实际案例——抓取千千音乐网站歌单数据,手把手带你完成一个完整的爬虫项目。
这个项目特别适合希望系统学习网络爬虫开发的Python初学者,以及需要在实际工作中采集结构化网页数据的中级开发者。
我推荐使用PyCharm作为开发环境,它提供了完善的Python项目管理和调试功能。以下是具体配置步骤:
# 创建项目目录并进入 / create and enter the project directory
mkdir music_spider && cd music_spider
# 创建隔离的虚拟环境 / create an isolated virtual environment
python -m venv venv
source venv/bin/activate # Linux/Mac
venv\Scripts\activate # Windows
# 安装核心依赖 / install the core dependencies
pip install requests beautifulsoup4 pandas
提示:建议固定依赖版本以避免兼容性问题,可使用
pip freeze > requirements.txt生成依赖清单
合理的目录结构能显著提升代码可维护性:
/music_spider
├── /venv # 虚拟环境
├── /data # 存储爬取结果
├── utils.py # 公共函数
├── spider.py # 主爬虫脚本
└── requirements.txt # 依赖清单
我们以千千音乐歌单页面(https://music.91q.com/songlist/309271)为例,使用Chrome开发者工具(F12)分析:
关键发现:
通过调试Source面板中的JavaScript代码,我们发现sign参数是通过以下方式生成的:
// Sign-generation logic recovered from the site's JavaScript (Sources panel).
function generateSign(params) {
  const secret = '8a6d8b7d2e9'; // example secret — obtain the real one by analysis
  return md5(params.toString() + secret);
}
对应的Python实现:
import hashlib


def generate_sign(params: dict, secret: str) -> str:
    """Reproduce the site's request signature in Python.

    The parameters are sorted by key, joined as ``k=v`` pairs with ``&``,
    concatenated with the secret, and MD5-hashed — mirroring the JavaScript
    ``generateSign`` recovered from the site.

    Args:
        params: request parameters to sign.
        secret: the signing key extracted from the site's JS.

    Returns:
        Lowercase hex MD5 digest used as the ``sign`` parameter.
    """
    # sorted() guarantees a deterministic parameter order, matching the server
    param_str = '&'.join([f'{k}={v}' for k, v in sorted(params.items())])
    return hashlib.md5((param_str + secret).encode()).hexdigest()
在utils.py中封装通用请求函数:
import requests
from typing import Any, Dict, Optional


def make_request(url: str, params: Dict[str, Any], headers: Dict[str, str]) -> Optional[Dict]:
    """Issue a GET request and decode the JSON response.

    Args:
        url: target endpoint.
        params: query-string parameters (including the ``sign``).
        headers: request headers (User-Agent, Referer, ...).

    Returns:
        The decoded JSON body, or ``None`` when the request fails —
        callers must check for ``None`` before dereferencing the result.
    """
    try:
        resp = requests.get(url, params=params, headers=headers, timeout=10)
        resp.raise_for_status()  # turn 4xx/5xx into an exception
        return resp.json()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
在spider.py中实现核心功能:
python复制import pandas as pd
from utils import make_request, generate_sign
class MusicSpider:
    """Scraper for playlist listings on music.91q.com."""

    BASE_URL = "https://music.91q.com/api/songlist"
    SECRET = "8a6d8b7d2e9"  # example secret — recover the real one from the site's JS

    def __init__(self):
        # A realistic UA and Referer reduce the chance of being blocked.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Referer": "https://music.91q.com"
        }

    def fetch_playlist(self, page: int, genre: str) -> list:
        """Fetch one page of playlists for *genre*.

        Args:
            page: 1-based page number.
            genre: genre/type string understood by the API.

        Returns:
            The ``list`` field of the JSON response, or ``[]`` on failure.
        """
        import time  # BUG FIX: the original snippet used time.time() without importing time

        params = {
            "page": page,
            "type": genre,
            "timestamp": int(time.time())
        }
        # sign must be computed over the other params, then appended
        params["sign"] = generate_sign(params, self.SECRET)
        data = make_request(self.BASE_URL, params, self.headers)
        # BUG FIX: make_request returns None on failure — guard before .get()
        return data.get("list", []) if data else []
使用pandas库高效存储数据:
def save_to_csv(data: list, filename: str):
    """Persist crawled playlist rows to ``data/<filename>`` as CSV.

    Args:
        data: list of row dicts; each is expected to carry a ``song_id`` key.
        filename: bare file name — the file is written inside ``data/``.
    """
    import os  # local import keeps this snippet self-contained

    if not data:
        # nothing to save — also avoids KeyError from drop_duplicates on an empty frame
        return
    os.makedirs("data", exist_ok=True)  # original crashed when data/ did not exist
    df = pd.DataFrame(data)
    # fill missing fields so the CSV has no NaN holes
    df.fillna("", inplace=True)
    # de-duplicate on the unique song identifier
    df.drop_duplicates(subset=["song_id"], inplace=True)
    # BUG FIX: the original wrote a literal placeholder instead of using *filename*
    df.to_csv(f"data/{filename}", index=False, encoding="utf_8_sig")
完整的数据采集流程:
def main():
    """Entry point: crawl five pages for each genre, then persist everything."""
    spider = MusicSpider()
    collected = []
    for style in ["华语", "欧美", "日语"]:
        # first five pages per genre
        for page_no in range(1, 6):
            print(f"正在抓取 {style} 第 {page_no} 页...")
            page_items = spider.fetch_playlist(page_no, style)
            if page_items:
                collected.extend(page_items)
            # polite delay between requests
            time.sleep(1)
    save_to_csv(collected, "playlists.csv")
import random  # BUG FIX: the original snippet used random.choice without importing random

# Pool of realistic User-Agent strings to rotate between requests.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X)"
]


def get_random_agent():
    """Return a User-Agent picked uniformly at random from the pool."""
    return random.choice(USER_AGENTS)
import random  # the original snippet used random.choice without importing random

# Example proxy pool — replace with real, working proxies.
PROXIES = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080"
]


def make_request_with_proxy(url, params):
    """GET *url* through a randomly chosen proxy.

    Args:
        url: target URL.
        params: query-string parameters.

    Returns:
        The ``requests.Response`` object.
    """
    chosen = random.choice(PROXIES)
    # BUG FIX: the original only set the "http" key, so https:// URLs
    # silently bypassed the proxy; route both schemes through it.
    proxy = {"http": chosen, "https": chosen}
    # timeout added so a dead proxy cannot hang the crawl indefinitely
    return requests.get(url, params=params, proxies=proxy, timeout=10)
import aiohttp
import asyncio


async def fetch_async(url, session):
    """GET *url* through *session* and decode the JSON body."""
    async with session.get(url) as resp:
        payload = await resp.json()
        return payload


async def main_async():
    """Fan out one concurrent request per entry in the *urls* iterable."""
    async with aiohttp.ClientSession() as session:
        coros = (fetch_async(u, session) for u in urls)
        return await asyncio.gather(*coros)
def load_existing_ids(filename):
    """Return the set of song_ids already stored in *filename*.

    Args:
        filename: path of a previously written CSV.

    Returns:
        A set of ids, or an empty set when no usable history exists.
    """
    try:
        df = pd.read_csv(filename)
        return set(df["song_id"].tolist())
    # BUG FIX: also catch KeyError — a CSV without a song_id column
    # previously crashed the incremental-crawl path.
    except (FileNotFoundError, KeyError):
        return set()


def filter_new_items(items, existing_ids):
    """Keep only the items whose song_id has not been crawled before."""
    return [item for item in items if item["song_id"] not in existing_ids]
原始代码中出现的类别乱序问题,本质上是由于:
解决方案:
# Method 1: stamp each row with its fetch time, then sort on that column.
df["fetch_time"] = pd.to_datetime("now")
df.sort_values("fetch_time", inplace=True)

# Method 2: use synchronized queues so results carry their origin.
from queue import Queue

task_queue = Queue()
result_queue = Queue()


def worker():
    """Consume tasks forever, tagging each result with its (page, genre) origin."""
    while True:
        task = task_queue.get()
        result = process_task(task)
        # keep the originating page/genre so results can be re-ordered later
        result_queue.put((task["page"], task["genre"], result))
        task_queue.task_done()
def safe_extract(data, keys, default=""):
    """Walk *data* along the dot-separated *keys* path.

    Args:
        data: nested dict-like structure.
        keys: path such as ``"a.b.c"``.
        default: value returned when any step of the path is missing.

    Returns:
        The value at the end of the path, or *default*.
    """
    node = data
    for part in keys.split("."):
        try:
            node = node[part]
        except (KeyError, TypeError):
            # missing key, or current node is not subscriptable by string
            return default
    return node
这个基础爬虫可以进一步扩展为:
import matplotlib.pyplot as plt


def show_genre_distribution(df):
    """Render a bar chart of how many rows fall into each genre."""
    counts = df["genre"].value_counts()
    counts.plot(kind="bar")
    plt.title("歌单类型分布")
    plt.show()
我在实际开发中总结的几个关键经验: