Manipulate PDF in Python¶
PyPDF2: open source, free. 老牌 Python PDF 项目.
PyMuPDF: open source (AGPL), but a commercial license must be purchased for closed-source commercial projects. 全面且强大, 基于 C 写的 MuPDF. 但是 Python 包自带的是预编译的 MuPDF binary, 所以无需 yum, apt install. 美中不足就是商用需要 License.
pdf2image: open source, free. 功能很简单, 将 PDF 转化为 Image 图片, 底层用的是 poppler 这个 PDF Render 工具. 需要用 yum, apt install poppler 的 CLI 之后才能使用.
pdfminer: MIT.
PyPDF2 Example:
# -*- coding: utf-8 -*-
import io
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter
# Split ``w2.pdf`` (located one directory above this script) into
# single-page PDF files named ``1.pdf``, ``2.pdf``, ... next to this script.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

# The source PDF is loaded fully into memory and parsed from a bytes stream.
reader = PdfReader(io.BytesIO(path_w2_pdf.read_bytes()))
n_page = len(reader.pages)

for page, src_page in enumerate(reader.pages, start=1):
    # A fresh writer per page, so each output holds exactly one page.
    writer = PdfWriter()
    writer.add_page(src_page)

    # Serialize into an in-memory buffer first, then dump the bytes to disk.
    with io.BytesIO() as buffer:
        writer.write(buffer)
        pdf_bytes = buffer.getvalue()

    path_dst = dir_here / f"{page}.pdf"
    path_dst.write_bytes(pdf_bytes)
PyMuPDF Example:
# -*- coding: utf-8 -*-
"""
Reference:
- https://pypi.org/project/PyMuPDF/
"""
from io import BytesIO
import fitz # This is the PyMuPDF import name
from pathlib import Path
# For every page of the sample PDF this script writes two files next to
# itself: a single-page PDF (``<stem>-page-<n>.pdf``) and a 200 dpi PNG
# rendering of the page (``<stem>-<n>.png``).
dir_here = Path(__file__).absolute().parent

# --- Set the sample PDF file path you want to test with
path_w2_pdf = dir_here.parent / "w2.pdf"
path_pdf = path_w2_pdf

# You can use either ``fitz.Document(filename=...)``
# or use bytes stream ``fitz.Document(stream=...)`` to read the PDF content
doc = fitz.Document(stream=path_pdf.read_bytes())

# Repair any issues (hopefully) before we hit them
# See this https://github.com/pymupdf/PyMuPDF/issues/856
buffer = BytesIO()
buffer.write(doc.write(clean=True, garbage=4))  # write the document to in-memory buffer
new_content = buffer.getvalue()
buffer.close()
doc = fitz.Document(stream=new_content)

for page_num, page in enumerate(doc, start=1):
    # --- split page
    doc1 = fitz.Document()  # new empty PDF
    # Copy ONLY the current page. ``page_num`` is 1-based while
    # ``insert_pdf`` page indexes are 0-based. Calling ``insert_pdf(doc)``
    # without a range would copy the whole document into every output file.
    doc1.insert_pdf(doc, from_page=page_num - 1, to_page=page_num - 1)
    p = dir_here / f"{path_pdf.stem}-page-{page_num}.pdf"
    # the document is saved to a file path given as a plain string
    doc1.save(f"{p}")
    doc1.close()  # release the per-page document before the next iteration

    # --- convert page to image
    pix: fitz.Pixmap = page.get_pixmap(dpi=200)
    p = dir_here / f"{path_pdf.stem}-{page_num}.png"
    # the pixmap is saved to a file path given as a plain string
    pix.save(f"{p}", output="png")
pikepdf Example:
# -*- coding: utf-8 -*-
from pathlib import Path
from pikepdf import Pdf, PdfImage
# Split ``w2.pdf`` (located one directory above this script) into
# single-page PDF files named ``page-<n>.pdf`` next to this script.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

# Open the source PDF in a ``with`` block so it is closed once all pages
# have been written out, instead of staying open until the interpreter exits.
with Pdf.open(f"{path_w2_pdf}") as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        # split page: a new empty PDF receives exactly one page
        dst = Pdf.new()
        dst.pages.append(page)
        path_dst = dir_here / f"page-{page_num}.pdf"
        dst.save(f"{path_dst}")
pdf2image Example:
# -*- coding: utf-8 -*-
"""
- Pypi: https://pypi.org/project/pdf2image/
Dependencies:
Mac:
- Install `poppler for Mac <https://macappstore.org/poppler/>`_
- do ``brew install poppler``
- use ``brew list poppler`` to figure out the poppler bin folder, on my computer it is ``/opt/homebrew/Cellar/poppler/22.08.0/bin/``
Linux (Redhat):
- Install poppler for Linux ``sudo yum install poppler-utils``
- Check it is installed ``yum list poppler-utils``
"""
from pathlib import Path
from pdf2image import convert_from_path
# Render every page of ``w2.pdf`` (located one directory above this script)
# to a PNG image named ``page-<n>.png`` next to this script.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

# Rendering options passed to pdf2image.
# A ``poppler_path="/opt/homebrew/Cellar/poppler/22.08.0/bin/"`` entry can be
# added here as well; per the original note it is not needed on Linux.
render_options = {
    "dpi": 300,
    "fmt": "png",
}
images = convert_from_path(f"{path_w2_pdf}", **render_options)

# Page numbers in the output file names are 1-based.
for index, image in enumerate(images):
    page_num = index + 1
    path_output = dir_here / f"page-{page_num}.png"
    image.save(f"{path_output}")