Processing PDF Files with Python (pdfminer)

Portable Document Format(可移植文档格式)即PDF文件格式,是由Adobe发明,目前由国际标准化组织(ISO)维护的开放标准。

在Python中处理PDF有以下途径:

  • pdfminer系列,比较专业的文本提取工具。包括pdfminer、pdfminer.six等
    • pdfplumber:基于pdfminer系列的高效PDF内容提取工具
  • PyPDF2 也是一款比较专业有口碑的python PDF处理工具。不仅支持文本,还支持元数据提取,以及其他分割、合并等编辑。支持Python3。
  • PyPDF4:PyPDF2停止更新之后分支出来的新版本,可能仍在更新。

pdfminer系列工具

  • pdfminer是比较得到大家公认的PDF提取工具,然而更新停止在python2;自python3大行其道之后,大家就只能另寻新工具了。
  • pdfminer.six:python3来了,“幸运”的是,我们后来又有了pdfminer.six分支,大家会及时地用脚投票,踩出一条新路。为什么叫six呢?因为它是[Fork of PDFMiner using six for Python 2+3 compatibility],即借助six库同时兼容Python 2和Python 3的PDFMiner分支。

pdfminer.six 安装

pip install pdfminer.six

import pdfminer
help(pdfminer)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
Help on package pdfminer:

NAME
pdfminer - Fork of PDFMiner using six for Python 2+3 compatibility

DESCRIPTION
PDFMiner is a tool for extracting information from PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting and analyzing
text data. PDFMiner allows to obtain the exact location of texts in a page,
as well as other information such as fonts or lines.
It includes a PDF converter that can transform PDF files into other text
formats (such as HTML). It has an extensible PDF parser that can be used for
other purposes instead of text analysis.

PACKAGE CONTENTS
arcfour
ascii85
ccitt
cmapdb
converter
encodingdb
fontmetrics
glyphlist
high_level
image
latin_enc
layout
lzw
pdfcolor
pdfdevice
pdfdocument
pdffont
pdfinterp
pdfpage
pdfparser
pdftypes
psparser
rijndael
runlength
settings
utils

VERSION
20181108

pdfminer.six 使用

pdfminer系列的接口都比较繁琐,不太方便使用,文档也并不全。如果有需要的同学,当然最好是去研究源代码;而如果只是随便用一下,最方便的就是记一下例子,然后需要的时候拿出来改改。

提取PDF文件全部文字

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or ``None`` when
        nothing was extracted.
    """
    resource_manager = PDFResourceManager()
    # The converter writes the decoded text into this in-memory buffer.
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        # Close the handles even when opening or parsing the PDF fails.
        converter.close()
        fake_file_handle.close()

    # Keep the original contract: an empty result is reported as None.
    return text or None

if __name__ == '__main__':
    # Demo run: print all text extracted from the sample PDF.
    sample_text = extract_text_from_pdf('./samplePDF/SF424_page2.pdf')
    print(sample_text)

按页提取pdf中文字

如果不是一次提取所有文字,也可以按页提取。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

# miner_text_generator.py
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF one page at a time.

    Args:
        pdf_path: Path of the PDF file to read.

    Yields:
        The text of each page, in the order ``PDFPage.get_pages`` returns
        the pages.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            # A fresh buffer and converter per page keep each page's text
            # separate from the others.
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close before yielding: code placed after a yield never runs
                # if the consumer stops iterating early, and would also be
                # skipped when process_page() raises.
                converter.close()
                fake_file_handle.close()
            yield text
def extract_text(pdf_path):
    """Print the text of every page of a PDF, each followed by a blank line.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    for page_text in extract_text_by_page(pdf_path):
        # One newline ends the page text, the second produces the blank line.
        print(page_text, end='\n\n')
if __name__ == '__main__':
    # extract_text() prints the pages itself and returns nothing, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    extract_text('./samplePDF/SF424_page2.pdf')

提取文件输出至XML文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# xml_exporter.py
import os
import xml.etree.ElementTree as xml
#from miner_text_generator import extract_text_by_page
from xml.dom import minidom
def export_as_xml(pdf_path, xml_path):
    """Write the per-page text of a PDF to a pretty-printed XML file.

    The root element is named after the PDF file (base name without
    extension). It contains a ``Pages`` element with one ``Page_<n>`` child
    per page, numbered from 1. Only the first 700 characters of each page
    are stored.

    Args:
        pdf_path: Path of the PDF file to read.
        xml_path: Path of the XML file to create or overwrite.
    """
    # XML 1.0 forbids control characters other than tab, LF and CR. Left in
    # the text they are serialized as-is and minidom.parseString() below
    # rejects the document, so they are removed from the page text.
    illegal_xml_chars = {
        code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    }

    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    # NOTE(review): the file name is used verbatim as the root tag, so it has
    # to be a valid XML name already (e.g. no spaces, no leading digit).
    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')
    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        text = xml.SubElement(pages, 'Page_{}'.format(counter))
        text.text = page[0:700].translate(illegal_xml_chars)

    # Round-trip through minidom only to get indented output.
    xml_string = xml.tostring(root, 'utf-8')
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent=' ')
    # The generated XML declaration names no encoding, which readers treat as
    # UTF-8, so the file must not be written in a platform-specific encoding.
    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)

if __name__ == '__main__':
    # Demo run: convert the sample PDF into an XML file next to it.
    export_as_xml(
        pdf_path='./samplePDF/SF424_page2.pdf',
        xml_path='./samplePDF/SF424_page2.xml',
    )

提取文件输出至JSON文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# json_exporter.py
import json
import os
#from miner_text_generator import extract_text_by_page
def export_as_json(pdf_path, json_path):
    """Write the leading text of each PDF page to a JSON file.

    The output is an object with the PDF's base name (without extension)
    under ``Filename`` and, under ``Pages``, a list of single-key objects
    ``{"Page_<n>": text}`` numbered from 1. Only the first 100 characters of
    each page are stored.

    Args:
        pdf_path: Path of the PDF file to read.
        json_path: Path of the JSON file to create or overwrite.
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    page_entries = [
        {'Page_{}'.format(number): page_text[0:100]}
        for number, page_text in enumerate(extract_text_by_page(pdf_path),
                                           start=1)
    ]
    payload = {'Filename': stem, 'Pages': page_entries}
    with open(json_path, 'w') as out_file:
        json.dump(payload, out_file)
if __name__ == '__main__':
    # Demo run: convert the sample PDF into a JSON file next to it.
    export_as_json(
        pdf_path='./samplePDF/SF424_page2.pdf',
        json_path='./samplePDF/SF424_page2.json',
    )

提取文件输出至CSV文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# csv_exporter.py
import csv
import os
#from miner_text_generator import extract_text_by_page
def export_as_csv(pdf_path, csv_path):
    """Write the leading words of each PDF page as one CSV row per page.

    For every page, the first 100 characters of its text are split on
    whitespace and the resulting words become the cells of one row.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to create or overwrite.
    """
    # Files handed to csv.writer must be opened with newline='' so the csv
    # module controls the row terminator; otherwise platforms that translate
    # line endings on write end up with an extra blank line after every row.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for page in extract_text_by_page(pdf_path):
            writer.writerow(page[0:100].split())
if __name__ == '__main__':
    # Demo run: convert the sample PDF into a CSV file next to it.
    export_as_csv(
        pdf_path='./samplePDF/SF424_page2.pdf',
        csv_path='./samplePDF/SF424_page2.csv',
    )

pdfminer工具

pdfminer在安装时,会安装一些脚本工具:

  • pdf2txt.py
  • dumppdf.py
1
2
3
4
5
6
7
8
9
10
11
pdf2txt.py

usage: pdf2txt.py [-h] [-d] [-p PAGENOS]
[--page-numbers PAGE_NUMBERS [PAGE_NUMBERS ...]]
[-m MAXPAGES] [-P PASSWORD] [-o OUTFILE] [-t OUTPUT_TYPE]
[-c CODEC] [-s SCALE] [-A] [-V] [-W WORD_MARGIN]
[-M CHAR_MARGIN] [-L LINE_MARGIN] [-F BOXES_FLOW]
[-Y LAYOUTMODE] [-n] [-R ROTATION] [-O OUTPUT_DIR] [-C] [-S]
files [files ...]

pdf2txt.py -o ./samplePDF/sample.xml ./samplePDF/SF424_page2.pdf