| 知乎专栏 |
安装
pip install docx2pdf
macOS 还需要安装
brew install aljohri/-/docx2pdf
命令行演示
查看帮助信息 docx2pdf --help
neo@MacBook-Pro-Neo ~ % docx2pdf --help
usage: docx2pdf [-h] [--keep-active] [--version] input [output]
Example Usage:
Convert single docx file in-place from myfile.docx to myfile.pdf:
docx2pdf myfile.docx
Batch convert docx folder in-place. Output PDFs will go in the same folder:
docx2pdf myfolder/
Convert single docx file with explicit output filepath:
docx2pdf input.docx output.pdf
Convert single docx file and output to a different explicit folder:
docx2pdf input.docx output_dir/
Batch convert docx folder. Output PDFs will go to a different explicit folder:
docx2pdf input_dir/ output_dir/
positional arguments:
input input file or folder. batch converts entire folder or convert single file
output output file or folder
optional arguments:
-h, --help show this help message and exit
--keep-active prevent closing word after conversion
--version display version and exit
代码演示
from docx2pdf import convert
# Convert a single file in place; per the CLI help above, input.docx
# becomes input.pdf next to the original.
convert("input.docx")
# Convert a single file and write the PDF to an explicit output path.
convert("input.docx", "output.pdf")
# Batch-convert a folder in place; per the CLI help above, the PDFs are
# written into the same folder.
convert("my_docx_folder/")
如果只是转换一个文档,我们就没有必要用Python了。下面的程序是批量转换指定目录下的 Word 文档。
import os
import glob
from pathlib import Path

from docx2pdf import convert

# Directory to convert; here it is the current working directory.
path = os.getcwd() + '/'
p = Path(path)
# Recursively collect every .docx file below the directory.
files = list(p.glob("**/*.docx"))
for file in files:
    # Replace the extension so report.docx becomes report.pdf rather than
    # report.docx.pdf; the PDF is written next to the source document.
    convert(file, str(file.with_suffix(".pdf")))
neo@MacBook-Pro-Neo ~/workspace/python % pip install pdfplumber
查看 pdfplumber 是否安装成功
neo@MacBook-Pro-Neo ~/workspace/python % pip show pdfplumber
Name: pdfplumber
Version: 0.5.26
Summary: Plumb a PDF for detailed information about each char, rectangle, and line.
Home-page: https://github.com/jsvine/pdfplumber
Author: Jeremy Singer-Vine
Author-email: jsvine@gmail.com
License: UNKNOWN
Location: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages
Requires: pdfminer.six, Pillow, Wand
Required-by:
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
# Keep the PDF open only for the duration of the with block.
with pdfplumber.open(file) as pdf:
    # Print the document metadata.
    print(pdf.metadata)
输出结果
{'Producer': 'macOS 版本11.2.1(版号20D74) Quartz PDFContext', 'CreationDate': "D:20210227145013Z00'00'", 'ModDate': "D:20210227145013Z00'00'"}
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
with pdfplumber.open(file) as pdf:
    # Print the number of pages in the document.
    print(len(pdf.pages))
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
with pdfplumber.open(file) as pdf:
    # Work with the first page only.
    first_page = pdf.pages[0]
    # Page number of the current page.
    print('页码:',first_page.page_number)
    # Width of the current page.
    print('页宽:',first_page.width)
    # Height of the current page.
    print('页高:',first_page.height)
输出结果
neo@MacBook-Pro-Neo ~/workspace/python % python3 pdf.py
页码: 1
页宽: 1324
页高: 7638
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
with pdfplumber.open(file) as pdf:
    first_page = pdf.pages[0]
    # Extract the text of the first page and print it.
    text = first_page.extract_text()
    print(text)
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
with pdfplumber.open(file) as pdf:
    first_page = pdf.pages[0]
    # Extract the table on the first page and print it as a whole.
    table = first_page.extract_table()
    print(table)
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
with pdfplumber.open(file) as pdf:
    first_page = pdf.pages[0]
    table = first_page.extract_table()
    # Print the table one row per line so it is easier to read.
    for t in table:
        print(t)
neo@MacBook-Pro-Neo ~/workspace/python % python3 pdf.py
['关注', '⽐较', '序号', '基⾦代码', '基⾦简称', '2021-02-26', None, '2021-02-25', None, '⽇增长值', '⽇增长率', '申购状态', '赎回状态', '⼿续费']
[None, None, None, None, None, '单位净值', '累计净值', '单位净值', '累计净值', None, None, None, None, None]
['', '', '1', '501030', '汇添富中证环境治理指数A 估值图 基⾦吧', '0.5501', '0.5501', '0.5421', '0.5421', '0.0080', '1.48%', '开放', '开放', '0.12%']
['', '', '2', '501031', '汇添富中证环境治理指数C 估值图 基⾦吧', '0.5471', '0.5471', '0.5392', '0.5392', '0.0079', '1.47%', '开放', '开放', '0.00%']
['', '', '3', '164908', '交银中证环境治理(LOF) 估值图 基⾦吧', '0.4890', '0.4890', '0.4820', '0.4820', '0.0070', '1.45%', '开放', '开放', '0.12%']
['', '', '4', '004005', '东⽅民丰回报赢安混合A 估值图 基⾦吧', '1.0564', '1.0709', '1.0438', '1.0583', '0.0126', '1.21%', '开放', '开放', '0.06%']
['', '', '5', '004006', '东⽅民丰回报赢安混合C 估值图 基⾦吧', '1.0463', '1.0593', '1.0338', '1.0468', '0.0125', '1.21%', '开放', '开放', '0.00%']
neo@MacBook-Pro-Neo ~/workspace/python % pip install openpyxl
import os,pdfplumber
import pandas as pd
# Path of the sample PDF; "~" is expanded to the current user's home directory.
file = os.path.expanduser("~/tmp/每日开放式基金净值表.pdf")
# Read the PDF and save the table on its first page as an Excel workbook.
with pdfplumber.open(file) as pdf:
    first_page = pdf.pages[0]
    # Extract the table; the result is a list of rows.
    table = first_page.extract_table()
    # Row 0 supplies the column names. Row 1 is skipped as well because, in
    # the sample output shown in this article, it is a second header row
    # rather than data.
    save = pd.DataFrame(table[2:],columns=table[0])
    # Write the workbook; the article installs openpyxl for this step.
    save.to_excel('output.xlsx')