windows - Sådan laver du et python-program til at konvertere en masse pdfs til html?

Indlæg af Hanne Mølgaard Plasc

Problem



Jeg forsøgte at lave et program, der læser alle PDF-filer i en mappe og konverterer dem til HTML. Hvis mappen for eksempel indeholder file1.pdf, file2.pdf og file3.pdf, skal programmet efter kørsel have lavet file1.html, file2.html og file3.html – selvfølgelig uden at de oprindelige PDF-filer går tabt. Indtil nu har jeg kun kunnet konvertere en enkelt fil; jeg kan ikke få det til at ske for hver fil i mappen med en løkke.
Her er min kode:


import shlex
import subprocess
import os
import platform

def run(command):
    """Run an external command and capture its result.

    Args:
        command: Either a command-line string or an already split
            sequence of arguments (list or tuple). A sequence is the
            safer choice when any argument may contain spaces.

    Returns:
        A tuple ``(success, output, errors)``: ``success`` is True when
        the process exit code is 0, ``output`` and ``errors`` are the
        captured stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split by the caller: hand it to Popen verbatim so that
        # arguments containing spaces are not re-tokenized.
        args = list(command)
    elif platform.system() != 'Windows':
        # Without a shell, Popen needs an argument list on POSIX.
        args = shlex.split(command)
    else:
        # On Windows the command-line string is passed through unchanged.
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process to exit.
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Walk a directory tree and convert every PDF found in it to an HTML file
# placed next to the original PDF, using pdfminer's pdf2txt.py.

# Change this to your PDF file base directory.
# Raw string: backslashes in Windows paths must not be read as escapes.
base_directory = r'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    # SystemExit does not rely on the site-provided exit() helper.
    raise SystemExit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF; compare case-insensitively so
        # that files named *.PDF are converted as well.
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # file1.pdf -> file1.html, written alongside the source PDF
        # (the PDF itself is only read, never modified).
        html_path = os.path.splitext(file_path)[0] + '.html'
        # Quote both paths so directories or file names containing spaces
        # stay a single argument.
        # NOTE(review): on Windows, launching a .py script directly without
        # a shell may fail; it may need to be started through the Python
        # interpreter instead - confirm on the target machine.
        args = (html_path, file_path)
        success, output, errors = run('pdf2txt.py -o "%s" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)

Bedste reference


Dette er en komplet løsning, der bruger os.walk og pdf2htmlEX: [4] [5]


import shlex
import subprocess
import os
import platform

def run(command):
    """Run an external command and capture its result.

    Args:
        command: Either a command-line string or an already split
            sequence of arguments (list or tuple). A sequence is the
            safer choice when any argument may contain spaces.

    Returns:
        A tuple ``(success, output, errors)``: ``success`` is True when
        the process exit code is 0, ``output`` and ``errors`` are the
        captured stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split by the caller: hand it to Popen verbatim so that
        # arguments containing spaces are not re-tokenized.
        args = list(command)
    elif platform.system() != 'Windows':
        # Without a shell, Popen needs an argument list on POSIX.
        args = shlex.split(command)
    else:
        # On Windows the command-line string is passed through unchanged.
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process to exit.
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Walk a directory tree and convert every PDF found in it to HTML with
# pdf2htmlEX, writing each result into the directory of its source PDF.

# Change this to your PDF file base directory.
# Raw string: in a normal literal "\U" starts a unicode escape on Python 3
# and makes this path a syntax error.
base_directory = r'C:\Users\Admin\Desktop\learningpython\PROJECT'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    # SystemExit does not rely on the site-provided exit() helper.
    raise SystemExit(1)
# Change this to your pdf2htmlEX executable location
bin_path = r'C:\Python27\pdf2htmlEX-master\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    raise SystemExit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF; compare case-insensitively so
        # that files named *.PDF are converted as well.
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # Quote every path so that spaces in the executable location, the
        # destination directory or the file name do not split an argument.
        args = (bin_path, dir_path, file_path)
        success, output, errors = run('"%s" --dest-dir "%s" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)

Andre referencer 1


Kompilér pdf2htmlEX-projektet fra https://github.com/coolwanglu/pdf2htmlEX, og kald derefter pdf2htmlEX-kommandoen fra Python via et systemkald. [6]