第一次用Python的Requests库抓取Shopee商品页面时,我盯着返回的乱码数据愣了十分钟——既不是熟悉的HTML结构,也不是预期的JSON响应,而是一堆毫无意义的符号组合。这种经历在电商数据采集领域太常见了,特别是面对Shopee这类东南亚头部电商平台时。
乱码现象背后是平台精心设计的多层防御体系。最表层的问题表现为三种典型症状:
返回的HTML只是<div id="app"></div>这类空壳;响应体是"1,2,3,45,12,5..."之类毫无意义的数字序列;或者直接被重定向到验证页面。深层原因则涉及Shopee的动态渲染架构和行为验证系统。商品数据通过JavaScript动态加载,核心接口需要携带加密签名参数。更棘手的是,平台会实时检测以下特征:
请求头中缺少sec-ch-ua等现代浏览器特征头。下面是一个典型失败案例——直接请求商品页面:
import requests

# Typical failure case: fetching the product page directly with no
# browser headers — Shopee serves garbage or an empty HTML shell.
PRODUCT_URL = "https://shopee.sg/iPhone-15-Pro-256GB-i.123456.789012345"

resp = requests.get(PRODUCT_URL)
# Output may look like ",1,3,5,2,1..." or a bare <div id="app"></div> shell.
print(resp.text[:200])
破解乱码困局的关键在于绕过前端渲染,直接获取后端API的原始JSON数据。经过上百次测试,我总结出这些有效方法:
在Chrome中打开Shopee商品页,按F12进入开发者工具:
典型的数据接口特征:
路径包含/api/v4/item/get;请求头带有x-api-source等特殊标记。Shopee对移动端的反爬策略相对宽松,可通过抓包工具捕获APP通信:
关注mtop.taobao或shopee.api等域名,以及x-sign等鉴权参数。下面是模拟移动端API请求的示例:
# Simulated mobile-app API request: present the official iOS client's
# identity so the mobile endpoints accept the call.
headers = {
    "User-Agent": "ShopeeMobile/3.15.1 (iPhone; iOS 16.4)",
    "X-Requested-With": "XMLHttpRequest",
    # "rn" marks the request as coming from the React Native client.
    "X-Api-Source": "rn",
}

params = {
    "itemid": 123456789,
    "shopid": 987654321,
    # Placeholder — the real signature must be reverse-engineered.
    "signature": "加密签名需逆向获取",
}
使用Playwright这类现代自动化工具,可以监听网络请求而不触发反爬:
from playwright.sync_api import sync_playwright


def _dump_item_api(response):
    """Print the raw JSON of item-API responses; ignore everything else."""
    if "/api/v2/item/get" not in response.url:
        return
    print(response.json())  # raw JSON payload


with sync_playwright() as pw:
    browser = pw.chromium.launch(headless=False)
    page = browser.new_page()
    # Register the listener before navigating so no response is missed.
    page.on("response", _dump_item_api)
    page.goto("https://shopee.vn/product-example")
    page.wait_for_timeout(5000)
    browser.close()
对于加密参数,需要逆向解析前端代码:
在前端代码中搜索encrypt、sign等关键词。单次获取数据只是开始,要实现持续稳定采集需要系统化方案。我的生产环境配置包含这些关键模块:
使用browser-fingerprint库生成真实指纹:
from browser_fingerprint import generate_fingerprint

# Build a realistic Chrome-on-Windows fingerprint and graft its
# client-hint headers onto the shared request headers.
fp = generate_fingerprint(
    os="windows",
    os_version="10",
    browser="chrome",
    browser_version="120",
)
headers["sec-ch-ua"] = fp["sec_ch_ua"]
headers["sec-ch-ua-platform"] = fp["sec_ch_ua_platform"]
优质代理需要满足:
推荐代理测试方法:
def test_proxy(proxy):
    """Return True when *proxy* can reach Shopee's ping endpoint.

    Args:
        proxy: Proxy URL, e.g. ``"http://user:pass@host:port"``.

    Returns:
        bool: True if the ping endpoint answers HTTP 200 through the
        proxy within 10 seconds, False on any request failure or other
        status code.
    """
    try:
        resp = requests.get(
            "https://shopee.sg/api/v1/ping",
            proxies={"https": proxy},
            timeout=10,
        )
    except requests.RequestException:
        # Narrow catch: a bare ``except:`` would also swallow
        # KeyboardInterrupt / SystemExit.
        return False
    return resp.status_code == 200
针对Shopee常见的三种验证码:
# Slider-captcha solving example: locate where the puzzle piece fits.
import cv2
import numpy as np


def calculate_slide_distance(bg_path, slider_path):
    """Return the x-offset (in pixels) where the slider piece best matches
    the gapped background image.

    Args:
        bg_path: Path to the background (gapped) captcha image.
        slider_path: Path to the slider-piece image.

    Returns:
        int: Horizontal pixel offset of the best template match.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    bg = cv2.imread(bg_path, 0)          # 0 = load as grayscale
    slider = cv2.imread(slider_path, 0)
    # cv2.imread signals failure by returning None instead of raising,
    # which would otherwise surface as a cryptic matchTemplate error.
    if bg is None:
        raise FileNotFoundError(bg_path)
    if slider is None:
        raise FileNotFoundError(slider_path)
    res = cv2.matchTemplate(bg, slider, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc -> (minVal, maxVal, minLoc, maxLoc); we need maxLoc.x
    _, _, _, max_loc = cv2.minMaxLoc(res)
    return max_loc[0]
智能调度需要:
import random
import time


class RequestScheduler:
    """Compute per-request delays: random base, longer at night, reduced
    by however long we have already been idle since the last request."""

    def __init__(self):
        # Epoch-0 sentinel: the very first request needs no delay.
        self.last_request_time = 0

    def get_delay(self):
        """Return the number of seconds to sleep before the next request.

        Bug fix vs. the naive version: the delay is the *remaining* part
        of the base interval (``base - elapsed``, floored at 0), not
        ``max(base, elapsed)`` — the latter explodes to ~1.7e9 seconds on
        the first call and grows the longer the scheduler sits idle.
        """
        base = 2 + random.random() * 3  # 2-5 s base delay
        if time.localtime().tm_hour > 20:
            base *= 1.5  # be gentler at night
        elapsed = time.time() - self.last_request_time
        delay = max(0.0, base - elapsed)
        # Record now so the next call measures idle time from this point.
        self.last_request_time = time.time()
        return delay
原始JSON需要处理:
def clean_shopee_data(raw_json):
    """Normalize a raw Shopee item payload into a flat, typed dict.

    Prices arrive scaled by 100000; images arrive as bare file IDs that
    must be expanded into CDN URLs. Missing optional fields degrade
    gracefully; a missing ``itemid`` raises KeyError.
    """
    if "price" in raw_json:
        price = float(raw_json["price"]) / 100000
    else:
        price = None

    image_urls = []
    for img_id in raw_json.get("images", []):
        image_urls.append(f"https://cf.shopee.sg/file/{img_id}")

    attributes = {}
    for attr in raw_json.get("attributes", []):
        attributes[attr["name"]] = attr["value"]

    return {
        "product_id": raw_json["itemid"],
        "price": price,
        "image_urls": image_urls,
        "attributes": attributes,
    }
将上述组件整合为完整解决方案,这个Python类包含了我在实际项目中验证过的核心逻辑:
import asyncio
from playwright.async_api import async_playwright
from dataclasses import dataclass
import json
import random
@dataclass
class ShopeeConfig:
    """Connection settings for one scraper session."""

    # Proxy server URL, e.g. "http://user:pass@gate:8080".
    proxy: str
    # Full User-Agent string presented to Shopee.
    user_agent: str
    # Browser viewport, e.g. {"width": 1920, "height": 1080}.
    # None lets the scraper fall back to its 1920x1080 default.
    # String annotation: the field is optional (the default is None), and
    # the quoted form keeps `dict | None` valid on Python < 3.10.
    viewport: "dict | None" = None
class ShopeeScraper:
    """Scrape one Shopee product by intercepting its item-API response
    inside a headless, proxied browser session."""

    # Substring identifying the product-detail API among all responses.
    API_PATH = "/api/v4/item/get"

    def __init__(self, config: ShopeeConfig):
        self.config = config
        self.request_count = 0
        # Last successfully intercepted item payload. Bug fix: the
        # original never initialized this attribute, so reading it
        # before the first API hit raised AttributeError.
        self.product_data = None

    async def _intercept_api(self, response):
        """Store the item payload when *response* is the product API call.

        Side effect only — the result is written to ``self.product_data``
        rather than returned, because ``page.on`` discards handler
        return values.
        """
        if self.API_PATH not in response.url:
            return
        try:
            data = await response.json()
        except Exception:
            # Aborted or non-JSON bodies are expected noise; skip them.
            return
        if isinstance(data, dict) and data.get("error") == 0:
            self.request_count += 1
            self.product_data = data["data"]

    async def scrape_product(self, product_url):
        """Open *product_url* and return the intercepted item JSON,
        or None if the API response never arrived."""
        self.product_data = None
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                proxy={"server": self.config.proxy},
            )
            try:
                context = await browser.new_context(
                    user_agent=self.config.user_agent,
                    viewport=self.config.viewport
                    or {"width": 1920, "height": 1080},
                )
                page = await context.new_page()
                # Randomize MouseEvent.movementX so synthetic mouse input
                # looks less uniform to behavioral checks.
                await page.evaluate("""() => {
                    window.MouseEvent.prototype.old = window.MouseEvent.prototype.constructor;
                    window.MouseEvent.prototype.constructor = function(type, init) {
                        if (init) init.movementX = Math.floor(Math.random() * 10);
                        return new this.old(type, init);
                    }
                }""")
                # Bug fix: coroutines have no JS-style ``.then``; schedule
                # the async handler as a task instead.
                page.on(
                    "response",
                    lambda r: asyncio.create_task(self._intercept_api(r)),
                )
                await page.goto(product_url, wait_until="networkidle")
                await page.mouse.move(100, 100)
                await page.wait_for_timeout(random.randint(1000, 3000))
                if not self.product_data:
                    # Nudge lazy loaders once more before giving up.
                    await page.evaluate("window.scrollBy(0, 500)")
                    await page.wait_for_timeout(2000)
            finally:
                # Bug fix: always release the browser, even on errors.
                await browser.close()
        return self.product_data
# Usage example: scrape one product through an authenticated gateway proxy.
config = ShopeeConfig(
    proxy="http://user:pass@gate.proxyprovider.com:8080",
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36..."
    ),
)
scraper = ShopeeScraper(config)
data = asyncio.run(
    scraper.scrape_product("https://shopee.ph/iPhone-15-i.12345.67890")
)
关键异常处理策略:
平台的反爬机制平均每3-6个月会有重大更新。这些是我近期发现的有效对策:
新版Shopee开始通过WebSocket推送价格数据:
async def handle_websocket(ws):
    """Consume messages from *ws*, printing every text frame that
    mentions a price as parsed JSON; all other frames are ignored."""
    async for msg in ws:
        # Only text frames carrying price data are interesting here.
        if msg.type != "text":
            continue
        if "price" in msg.data:
            print(json.loads(msg.data))
async def monitor_prices(product_id):
    """Open a product page and stream its WebSocket price pushes."""
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        # Wait for the page to open its first WebSocket while navigating.
        async with page.expect_websocket() as ws_info:
            await page.goto(f"https://shopee.sg/product/{product_id}")
        socket = await ws_info.value
        await handle_websocket(socket)
部分数据改用GraphQL查询:
注意请求中的operationName和query参数,以及.graphql文件中的查询模板。Shopee新增的内存检测包括:
对performance.memory对象的检查等。解决方案是在启动浏览器时注入补丁:
javascript复制await page.add_init_script("""
delete Performance.prototype.memory;
window.WebAssembly = undefined;
""")
大规模采集建议采用:
import time

import redis
from redlock import RedLock

r = redis.Redis()
# Distributed lock so multiple workers share one rate budget.
lock = RedLock("shopee_rate_limit", connection_details=[r])


def safe_request(url):
    """Fetch *url* while honoring a cluster-wide request budget.

    Holds the distributed lock so only one worker at a time consults
    and updates the shared counter.
    """
    with lock:
        # Bug fix: redis-py returns bytes (or None when the key is
        # absent); comparing that to an int raises TypeError on Py3,
        # so coerce before comparing.
        if int(r.get("req_count") or 0) > 100:
            time.sleep(60)  # back off for a minute once over budget
        # Bug fix: the counter was never incremented, so the limit
        # could never trigger. NOTE(review): add a TTL/reset elsewhere
        # so the budget refreshes per time window.
        r.incr("req_count")
        return requests.get(url)