作为一名长期使用Python处理文档的开发者,我发现PDF处理是日常工作中最常见的需求之一。无论是数据提取、格式转换还是内容编辑,掌握Python处理PDF的技能都能极大提升工作效率。本文将分享我多年来积累的PDF处理实战经验。
PDF(Portable Document Format)作为一种跨平台文档格式,广泛应用于各类场景。Python凭借丰富的库生态,提供了多种PDF处理方案。下面我将从基础操作到高级技巧,详细介绍如何使用Python高效处理PDF文件。
Python生态中有多个PDF处理库,各有侧重:
提示:对于新项目,建议优先考虑PyMuPDF(fitz),它在性能、功能和API设计上都有明显优势。
bash复制# 安装核心库
pip install pypdf2 pdfminer.six reportlab pdf2image pymupdf
# 额外依赖(图像处理)
pip install pillow
如果使用pdf2image进行转换,还需要安装poppler:
brew install popplersudo apt-get install poppler-utilspython复制from PyPDF2 import PdfReader
def inspect_pdf(filepath):
with open(filepath, 'rb') as f:
reader = PdfReader(f)
meta = reader.metadata
print(f"""
文档信息:
标题: {meta.title or '无'}
作者: {meta.author or '无'}
页数: {len(reader.pages)}
创建时间: {meta.creation_date}
""")
# 使用示例
inspect_pdf('sample.pdf')
python复制from PyPDF2 import PdfMerger
def merge_pdfs(output_path, *input_files):
merger = PdfMerger()
for file in input_files:
merger.append(file)
merger.write(output_path)
merger.close()
print(f"合并完成,保存至 {output_path}")
# 使用示例
merge_pdfs('merged.pdf', 'file1.pdf', 'file2.pdf', 'file3.pdf')
python复制from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_file, output_prefix, page_ranges):
reader = PdfReader(input_file)
for start, end in page_ranges:
writer = PdfWriter()
for i in range(start-1, end):
writer.add_page(reader.pages[i])
output_file = f"{output_prefix}_pages_{start}_to_{end}.pdf"
with open(output_file, 'wb') as f:
writer.write(f)
print(f"生成文件: {output_file}")
# 使用示例:拆分1-3页和4-5页
split_pdf('document.pdf', 'output', [(1,3), (4,5)])
pdfminer.six提供了强大的文本提取能力:
python复制from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
def extract_pdf_text(filepath, page_numbers=None):
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=0.5
)
text = extract_text(
filepath,
laparams=laparams,
page_numbers=page_numbers
)
return text.strip()
# 使用示例
text_content = extract_pdf_text('report.pdf', page_numbers=[0,1])
print(text_content[:500]) # 打印前500字符
python复制from PyPDF2 import PdfReader
def decrypt_pdf(input_file, password, output_file=None):
reader = PdfReader(input_file)
if reader.is_encrypted:
if reader.decrypt(password):
print("解密成功!")
if output_file:
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
with open(output_file, 'wb') as f:
writer.write(f)
print(f"已保存解密版本到 {output_file}")
return True
else:
print("密码错误!")
return False
else:
print("文件未加密")
return True
# 使用示例
decrypt_pdf('protected.pdf', 'mypassword', 'unprotected.pdf')
python复制from pdf2image import convert_from_path
import os
def pdf_to_images(pdf_path, output_folder, dpi=200, fmt='jpeg'):
os.makedirs(output_folder, exist_ok=True)
images = convert_from_path(
pdf_path,
dpi=dpi,
output_folder=output_folder,
fmt=fmt,
output_file='page'
)
print(f"转换完成,共生成 {len(images)} 张图片")
return images
# 使用示例
pdf_to_images('presentation.pdf', 'output_images', dpi=300)
python复制from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate
def create_pdf_report(output_file, title, content):
doc = SimpleDocTemplate(output_file, pagesize=A4)
styles = getSampleStyleSheet()
story = []
# 添加标题
title_style = styles['Title']
story.append(Paragraph(title, title_style))
# 添加内容
for paragraph in content:
p = Paragraph(paragraph, styles['BodyText'])
story.append(p)
doc.build(story)
print(f"报告已生成: {output_file}")
# 使用示例
report_title = "2023年度数据分析报告"
report_content = [
"本报告展示2023年度关键业务指标...",
"第一季度营收同比增长15%...",
"用户活跃度提升显著..."
]
create_pdf_report('annual_report.pdf', report_title, report_content)
python复制from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import Table, TableStyle
from reportlab.graphics.shapes import Drawing
from reportlab.graphics.charts.barcharts import VerticalBarChart
def create_pdf_with_table_chart(output_file):
doc = SimpleDocTemplate(output_file, pagesize=letter)
story = []
# 创建表格
data = [
['季度', '营收(万)', '利润(万)'],
['Q1', '1200', '350'],
['Q2', '1500', '420'],
['Q3', '1800', '500'],
['Q4', '2100', '600']
]
table = Table(data)
table.setStyle(TableStyle([
('BACKGROUND', (0,0), (-1,0), colors.grey),
('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
('ALIGN', (0,0), (-1,-1), 'CENTER'),
('FONTSIZE', (0,0), (-1,0), 14),
('BOTTOMPADDING', (0,0), (-1,0), 12),
('BACKGROUND', (0,1), (-1,-1), colors.beige),
('GRID', (0,0), (-1,-1), 1, colors.black)
]))
story.append(table)
# 添加柱状图
drawing = Drawing(400, 200)
bc = VerticalBarChart()
bc.x = 50
bc.y = 50
bc.height = 125
bc.width = 300
bc.data = [[350, 420, 500, 600]]
bc.categoryAxis.categoryNames = ['Q1', 'Q2', 'Q3', 'Q4']
bc.valueAxis.valueMin = 0
drawing.add(bc)
story.append(drawing)
doc.build(story)
print(f"带表格图表的PDF已生成: {output_file}")
create_pdf_with_table_chart('financial_report.pdf')
问题1:中文显示乱码
python复制from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf'))
style = ParagraphStyle('chinese', fontName='SimSun', fontSize=12)
问题2:处理扫描版PDF
python复制import pytesseract
from pdf2image import convert_from_path
def ocr_pdf(pdf_path):
images = convert_from_path(pdf_path)
text = ""
for i, img in enumerate(images):
text += f"=== Page {i+1} ===\n"
text += pytesseract.image_to_string(img, lang='chi_sim')
return text
问题3:大文件处理内存不足
python复制from PyPDF2 import PdfReader
def process_large_pdf(filepath):
with open(filepath, 'rb') as f:
reader = PdfReader(f)
for page in reader.pages:
# 逐页处理
text = page.extract_text()
# 处理逻辑...
python复制from PyPDF2 import PdfWriter
with PdfWriter() as writer:
for file in input_files:
reader = PdfReader(file)
writer.append(reader)
writer.write('output.pdf')
python复制from functools import lru_cache
@lru_cache(maxsize=10)
def get_pdf_reader(filepath):
return PdfReader(filepath)
python复制from concurrent.futures import ThreadPoolExecutor
def process_page(page):
return page.extract_text()
with ThreadPoolExecutor() as executor:
texts = list(executor.map(process_page, reader.pages))
%PDFpython复制def is_valid_pdf(filepath):
with open(filepath, 'rb') as f:
header = f.read(4)
return header == b'%PDF'
pdf-redact-tools删除元数据python复制from PyPDF2 import PdfReader
def check_hidden_content(filepath):
reader = PdfReader(filepath)
for page in reader.pages:
if '/Hidden' in page.get('/Annots', []):
print("发现隐藏内容!")
python复制import pandas as pd
from reportlab.lib import colors
from reportlab.platypus import Table
def df_to_pdf_table(df, output_file):
doc = SimpleDocTemplate(output_file)
data = [df.columns.values.tolist()] + df.values.tolist()
table = Table(data)
style = TableStyle([
('BACKGROUND', (0,0), (-1,0), colors.grey),
('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
('ALIGN', (0,0), (-1,-1), 'CENTER'),
('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
('FONTSIZE', (0,0), (-1,0), 12),
('BOTTOMPADDING', (0,0), (-1,0), 12),
('BACKGROUND', (0,1), (-1,-1), colors.beige),
('GRID', (0,0), (-1,-1), 1, colors.black)
])
table.setStyle(style)
doc.build([table])
print(f"DataFrame已保存为PDF表格: {output_file}")
# 使用示例
data = {'Month': ['Jan', 'Feb', 'Mar'], 'Sales': [1000, 1500, 2000]}
df = pd.DataFrame(data)
df_to_pdf_table(df, 'sales_report.pdf')
python复制from difflib import unified_diff
from PyPDF2 import PdfReader
def compare_pdfs(file1, file2):
text1 = extract_pdf_text(file1)
text2 = extract_pdf_text(file2)
lines1 = text1.splitlines()
lines2 = text2.splitlines()
diff = unified_diff(
lines1, lines2,
fromfile=file1,
tofile=file2,
lineterm=''
)
return '\n'.join(diff)
# 使用示例
difference = compare_pdfs('v1.pdf', 'v2.pdf')
with open('diff.txt', 'w') as f:
f.write(difference)
print("差异已保存到diff.txt")
python复制from fastapi import FastAPI, UploadFile, File
from fastapi.responses import FileResponse
app = FastAPI()
@app.post("/merge-pdfs/")
async def merge_pdfs(files: list[UploadFile] = File(...)):
merger = PdfMerger()
for file in files:
merger.append(file.file)
output_path = "merged.pdf"
merger.write(output_path)
merger.close()
return FileResponse(output_path)
@app.post("/extract-text/")
async def extract_text(file: UploadFile = File(...)):
text = extract_pdf_text(file.file)
return {"text": text}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
在实际项目中,我发现PyMuPDF(fitz)的性能通常比其他库高出一个数量级,特别是处理大型PDF文件时。以下是一个性能对比示例:
python复制import time
import fitz # PyMuPDF
def benchmark(pdf_path):
# PyPDF2
start = time.time()
reader = PdfReader(pdf_path)
text = ""
for page in reader.pages:
text += page.extract_text()
pypdf2_time = time.time() - start
# PyMuPDF
start = time.time()
doc = fitz.open(pdf_path)
text = ""
for page in doc:
text += page.get_text()
pymupdf_time = time.time() - start
print(f"PyPDF2: {pypdf2_time:.2f}s")
print(f"PyMuPDF: {pymupdf_time:.2f}s")
benchmark('large_document.pdf')
对于需要处理大量PDF的企业级应用,我建议考虑以下架构:
一个典型的PDF处理流程优化后,处理时间可以从分钟级降到秒级。在我的实践中,通过合理选择工具链和优化处理流程,成功将每月处理的十万份PDF文档的处理时间从8小时缩短到45分钟。