Utente:Alex brollo/pdf.py

Da Wikisource.
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8  -*-

# backup di scripts per la gestione dei file pdf Opal Libri Antichi
# nota: i metadati sono su file excel e txt esterno (23581 pdf)

#import PyPDF2
import os
import pickle
import re
import sys
from math import atan2, pi
from time import sleep
from urllib import FancyURLopener

from fpdf import FPDF
from PIL import Image, ImageDraw

# URL opener shared by grab() for downloading the pdf files.
opener=FancyURLopener({})
# Metadata table read from ElencoOpalMod.txt: one row per line, decoded from
# UTF-8. grab() and main() split each row on tabs. The file is closed as soon
# as it has been read instead of being left to the garbage collector.
with open("ElencoOpalMod.txt") as elenco:
    lista=unicode(elenco.read(),"utf-8").split("\n")
# Rows of ``lista`` selected by main(): el_main is the row of the first
# command entry, el_curr the row of the entry being processed.
el_main=""
el_curr=""

def grab(nome,folder="pdf_orig"):
    """Download the pdf ``nome`` into ``folder`` unless it is already there.

    The row describing the file is looked up in the module-level ``lista``
    (tab-separated rows whose first field is the file name). The third field
    of that row is used as the remote directory below the OPAL base URL.

    nome   -- file name of the pdf, as found in the first field of a row
    folder -- local directory where the pdf is looked for and saved

    Returns the tuple ``(nome, " non trovato")`` when no row of ``lista``
    starts with ``nome``; returns None in every other case.
    """
    if nome in os.listdir(folder):
        print("Pdf presente in pdf_orig")
        return

    elemento = []
    for riga in lista:
        if riga.startswith(nome + "\t"):
            elemento = riga.split("\t")
            break
    if not elemento:
        # The callers in this file ignore the return value, so also tell the
        # user that the file is unknown.
        print("%s  non trovato" % nome)
        return nome, " non trovato"

    # URLs are always "/"-separated: os.path.join would insert "\" on Windows.
    parti = ["http://www.opal.unito.it/psixsite",
             elemento[2].strip("/"),
             elemento[0].strip("/")]
    url = "/".join([p for p in parti if p])
    print(url)
    nome = elemento[0]

    contenuto = opener.open(url).read()
    # The server answers a missing file with an error page containing this text.
    if "File or directory not found." not in contenuto:
        with open(os.path.join(folder, nome), "wb") as destinazione:
            destinazione.write(contenuto)
        print("%s  salvato in  %s/%s" % (url, folder, nome))
    else:
        print("%s (file non scaricato)" % url)
    return

    


    

def cleanup(folder=None):
    """Remove every entry of ``folder`` that os.remove can delete.

    folder -- directory to empty; when omitted a message is printed and
              nothing is deleted.

    Entries that cannot be removed (os.remove raises OSError, e.g. for a
    sub-directory) are skipped, so the clean-up is best effort. Returns None.
    """
    if folder is None:
        print("Deve essere specificata una directory")
        return
    for nome in os.listdir(folder):
        try:
            os.remove(os.path.join(folder, nome))
        except OSError:
            # Only file-system errors are ignored; anything else (for example
            # KeyboardInterrupt) is no longer swallowed.
            continue
    return

def pdfinfo(pdf):
    """Run the external ``pdfinfo`` tool on ``pdf`` and parse its page boxes.

    The tool is run twice through the shell, each time redirecting its output
    to temp/dati.txt: first to read the page count, then with
    ``-box -f 1 -l <pages>`` to get the per-page data handed to parseinfo().

    pdf -- path of the pdf file, interpolated as-is into the command line

    Returns the dictionary built by parseinfo(), or an empty dictionary when
    a call fails or no page count is found in the output.
    """
    # NOTE(review): the path is inserted unquoted into a shell command; names
    # containing spaces or shell metacharacters will break the call.
    comando = "pdfinfo %s > temp/dati.txt" % pdf
    print(comando)
    result = os.system(comando)
    print(result)
    if result != 0:
        print("prima chiamata a pdfinfo fallita")
        return {}

    with open("temp/dati.txt") as dati:
        testo = dati.read()
    trovate = re.findall(r"Pages: +(\d+)", testo)
    if not trovate:
        # Without a page count the second call cannot be built.
        print("numero di pagine non trovato nell'output di pdfinfo")
        return {}
    pages = int(trovate[0])

    comando = "pdfinfo -box -f 1 -l %d %s >temp/dati.txt" % (pages, pdf)
    print(comando)
    result = os.system(comando)
    print(result)
    if result != 0:
        print("seconda chiamata a pdfinfo fallita")
        return {}

    with open("temp/dati.txt") as dati:
        testo = dati.read()
    return parseinfo(testo)
    
def cropJpg(info,split=True):
    cleanup("jpg")
    cleanup("jpg_mod")
    
    for imm in info:
        filename=info[imm]["file"]
        jpgOld=Image.open(os.path.join("images",filename))
        factor=jpgOld.size[0]/info[imm]["MediaBox"][2]
        box=(info[imm]["CropBox"][0]*factor,\
             (info[imm]["MediaBox"][3]-info[imm]["CropBox"][3])*factor,\
             info[imm]["CropBox"][2]*factor,\
             (info[imm]["MediaBox"][3]-info[imm]["CropBox"][1])*factor)
        
        jpgNew=jpgOld.crop(box)
        rotate=int(info[imm]["rotate"])
        if rotate != 0:
            jpgNew=jpgNew.rotate(-rotate, expand=True)
        jpgNew.save(os.path.join("jpg","p_%s.jpg" % imm.zfill(4)))
        if split:
            n=int(imm)*2-1
            jpgNew=paginaSplit(jpgNew)
            w,h=jpgNew.size
            for xx in range(w/2-100,w/2+100):
                if jpgNew.getpixel((xx,h/2)) == (255,0,0):
                    break
            retro=jpgNew.crop((0,0,xx-1,h))
            fronte=jpgNew.crop((xx+1, 0, w, h))
            
            retro.save(os.path.join("jpg_mod","p_%s.jpg" % str(n).zfill(4)))
            
            n+=1
            fronte.save(os.path.join("jpg_mod","p_%s.jpg" % str(n).zfill(4)))
            print n-1,n,
        else:
            jpgNew.save(os.path.join("jpg_mod","p_%s.jpg" % imm.zfill(4)))
            print imm,

    print
    return
        
def pdfOut(nomePdf,split=True,i=0):
    """Assemble the images of "jpg_mod" into the pdf "pdf_mod/<nomePdf>".

    nomePdf -- file name of the pdf to write inside "pdf_mod"
    split   -- whether the images were split into single pages
    i       -- position of the pdf in the command handled by main(); the
               first image is dropped only when ``split`` is true and i == 0

    The page format is taken from the second image (the first one when only
    one exists). Returns None.
    """
    # os.listdir returns the names in arbitrary order: sort them so the pages
    # follow their zero-padded numbering.
    lista = sorted(os.listdir("jpg_mod"))
    if not lista:
        print("Nessuna immagine in jpg_mod")
        return

    campione = lista[1] if len(lista) > 1 else lista[0]
    width, height = Image.open("jpg_mod/%s" % campione).size
    pdf = FPDF(unit="pt", format=[width, height])
    if split and i == 0:
        lista.pop(0)
    for image in lista:
        pdf.add_page()
        pdf.image(os.path.join("jpg_mod", image), 0, 0)
    pdf.output("pdf_mod/%s" % nomePdf, "F")
    return


        

def parseinfo(testo):
    """Parse the per-page lines of a ``pdfinfo -box`` report.

    testo -- text of the report; only lines of the form
             "Page <number> <field> ..." are used, every other line is skipped

    Returns a dictionary mapping the page number (as a string) to a
    dictionary with:
      "file"     -- image name "p-NNNN.jpg", numbered from the page number - 1
      "rotate"   -- rotation read from the "size:" line, as a string of
                    digits ("0" when the line reports no rotation)
      "MediaBox" -- list of four floats (when the line is present)
      "CropBox"  -- list of four floats (when the line is present)
    """
    info = {}
    for l in testo.split("\n"):
        lc = l.split()
        # Lines such as "Page size: ..." or truncated lines carry no page
        # number or no field: skip them instead of failing on them.
        if len(lc) < 3 or lc[0] != "Page" or not lc[1].isdigit():
            continue
        if lc[1] not in info:
            info[lc[1]] = {"file": "p-%s.jpg" % (str(int(lc[1]) - 1).zfill(4))}
        pagina = info[lc[1]]
        if lc[2] == "size:":
            # The number of tokens before "(rotated N degrees)" changes when
            # a paper name such as "(A4)" is printed, so search for the value
            # instead of reading a fixed token position.
            trovato = re.search(r"rotated\s+(\d+)", l)
            pagina["rotate"] = trovato.group(1) if trovato else "0"
        if "MediaBox" in l:
            pagina["MediaBox"] = [float(v) for v in lc[3:7]]
        if "CropBox" in l:
            pagina["CropBox"] = [float(v) for v in lc[3:7]]
    return info

def converti(box):
    """Convert, in place, every item of ``box`` to float and return ``box``.

    box -- list of strings holding float values
    """
    for posizione, valore in enumerate(box):
        box[posizione] = float(valore)
    return box

    
def pdfimages(pdfName, out="images"):
    """Extract the images of ``pdfName`` with the external ``pdfimages`` tool.

    pdfName -- path of the pdf, interpolated as-is into the command line
    out     -- folder that is emptied first and then receives the images,
               written with the file-name root "p"

    Returns None; the exit status of the tool is not checked.
    """
    cleanup(out)
    # The output root follows ``out`` (it used to be hard-coded to "images",
    # so any other folder was emptied but never filled).
    # NOTE(review): pdfName is inserted unquoted into a shell command.
    comando = "pdfimages -j %s %s/p" % (pdfName, out)
    print(comando)
    os.system(comando)
    return
    
# estrae le jpg da un pdf in puro python
##def extractJpg(pdfName,out="images"):
##    cleanup("images")
##    pdf = file(pdfName, "rb").read()
##
##    startmark = "\xff\xd8"
##    startfix = 0
##    endmark = "\xff\xd9"
##    endfix = 2
##    i = 0
##
##    njpg = 0
##    while True:
##        istream = pdf.find("stream", i)
##        if istream < 0:
##            break
##        istart = pdf.find(startmark, istream, istream+20)
##        if istart < 0:
##            i = istream+20
##            continue
##        iend = pdf.find("endstream", istart)
##        if iend < 0:
##            raise Exception("Didn't find end of stream!")
##        iend = pdf.find(endmark, iend-20)
##        if iend < 0:
##            raise Exception("Didn't find end of JPG!")
##         
##        istart += startfix
##        iend += endfix
##        #print "JPG %d from %d to %d" % (njpg, istart, iend)
##        print njpg,
##        jpg = pdf[istart:iend]
##        jpgfile = file(os.path.join(out,"jpg-%s.jpg" % str((njpg+1)).zfill(4)), "wb")
##        jpgfile.write(jpg)
##        jpgfile.close()
##         
##        njpg += 1
##        i = iend
##    print
##    return

### pickle utilities 

def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in ``folder + nome_file + ".pcl"``.

    nome_file -- file name without the ".pcl" extension
    folder    -- directory prefix, concatenated as-is (it must end with a
                 path separator)
    """
    nome_file = folder + nome_file + ".pcl"
    # NOTE(review): unpickling can execute arbitrary code; load only files
    # written by this program.
    # The file is opened in text mode, the same mode salva_pcl writes with.
    with open(nome_file) as f:
        contenuto = pickle.load(f)
    return contenuto

def salva_pcl(variabile,nome_file="dato",folder="dati/"):
    """Pickle ``variabile`` into ``folder + nome_file + ".pcl"``.

    variabile -- object to save
    nome_file -- file name without the ".pcl" extension
    folder    -- directory prefix, concatenated as-is (it must end with a
                 path separator)

    Prints the name of the file written and returns None.
    """
    nome_file = folder + nome_file + ".pcl"
    # Text mode is kept so the files stay readable by carica_pcl, which opens
    # them in text mode too; the file is closed even if pickling fails.
    with open(nome_file, "w") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file "+nome_file)
    return



def find_stringa(stringa,idi,idf,dc=0,x=None,side="left"):
    """Return the text found between the delimiters ``idi`` and ``idf``.

    stringa -- text to search
    idi     -- opening delimiter
    idf     -- closing delimiter, searched after the opening one
    dc      -- 0: return the text without the delimiters; any other value:
               return the text including both delimiters
    x       -- optional opening token used to handle nesting: the match is
               extended to further ``idf`` occurrences while it contains more
               ``x`` than ``idf``
    side    -- "right" starts from the last occurrence of ``idi``; any other
               value starts from the first one

    Returns "" when ``idi`` or ``idf`` is missing, or when the nesting
    requested through ``x`` cannot be balanced.
    """
    if side == "right":
        idip = stringa.rfind(idi)
    else:
        idip = stringa.find(idi)
    if idip < 0:
        return ""

    # Test the raw result of find(): after adding len(idf) a failed search
    # (-1) is no longer distinguishable from a real position.
    fine = stringa.find(idf, idip + len(idi))
    if fine < 0:
        return ""
    idfp = fine + len(idf)

    if x is not None:
        while stringa[idip:idfp].count(x) > stringa[idip:idfp].count(idf):
            fine = stringa.find(idf, idfp)
            if fine < 0:
                # More openers than closers and no closer left.
                return ""
            idfp = fine + len(idf)

    if dc == 0:
        return stringa[idip + len(idi):idfp - len(idf)]
    return stringa[idip:idfp]

def produci_lista(testo,idi,idf,dc=1,inizio=None):
    """Collect every delimited item that find_stringa() finds in ``testo``.

    testo  -- text to scan
    idi    -- opening delimiter
    idf    -- closing delimiter
    dc     -- 0: the items are returned without delimiters; any other value:
              the items keep their delimiters
    inizio -- passed to find_stringa() as its nesting token ``x``

    Returns the list of the items in the order they are found.
    """
    t = testo
    lista = []
    while True:
        # One search per item: the result serves both as loop condition and
        # as the item itself.
        el = find_stringa(t, idi, idf, 1, inizio)
        if el == "":
            break
        # Drop the item just found so that the next search moves on.
        t = t.replace(el, "", 1)
        if dc == 0:
            el = find_stringa(el, idi, idf, 0, inizio)
        lista.append(el)
    return lista

def go(nome,split=True):
    """Run the whole pipeline on the pdf ``nome``.

    nome  -- file name of the pdf; it is fetched by grab() and then read
             from "pdf_orig"
    split -- forwarded to cropJpg() and pdfOut()

    The steps are: grab, pdfinfo, pdfimages, cropJpg, pdfOut. Returns None.
    """
    sorgente = "pdf_orig/" + nome

    print("## cerco o scarico il file")
    grab(nome)
    print("")

    print("## creo info")
    info = pdfinfo(sorgente)

    print("## estraggo immagini")
    pdfimages(sorgente)

    print("## elaboro immagini")
    cropJpg(info, split)

    print("##preparo pdf (in pdf_mod)")
    pdfOut(nome, split)
    return



def lineSample(imm,x,delta):
    """Average the pixel values along a slanted line across the image.

    imm   -- image object exposing ``size`` and ``getpixel``; the pixels must
             have at least three channels
    x     -- horizontal position of the line on the top row
    delta -- horizontal shift of the line between the top and the bottom row

    About twenty points are sampled from top to bottom. Returns the tuple
    ``(average, x, delta)`` where ``average`` is the integer mean of the
    first three channels over the sampled points.
    """
    h = imm.size[1]
    dx = delta * 1.0 / h
    # A step of at least 1 keeps range() valid for images under 20 rows.
    passo = max(1, h // 20)
    punti = 0
    campioni = 0
    for y in range(0, h, passo):
        # Pixel coordinates are truncated to integers explicitly.
        pixel = imm.getpixel((int(x + dx * y), y))
        punti += pixel[0] + pixel[1] + pixel[2]
        campioni += 1

    # Divide by the number of values actually summed: the row count is not
    # always a multiple of 20, so the samples are not always exactly 20.
    return (punti // (3 * campioni), x, delta)

def paginaSplit(imm):
    """Mark the darkest near-central line of ``imm`` and straighten the image.

    Lines starting every 5 pixels between 45% and 55% of the width, each with
    a slant from -50 to 49 pixels, are scored with lineSample(). The line
    with the lowest score below 255 is drawn in red on ``imm`` (the image is
    modified in place), then a copy rotated so that the line becomes vertical
    is returned. When no line scores below 255 the red line is drawn on the
    left edge and the rotation angle is zero.

    imm -- PIL image, presumably a scan of two facing pages -- TODO confirm
    """
    width, height = imm.size
    # The first entry is the fallback (score 255, x 0, slant 0); min() keeps
    # the earliest of equal scores, so a candidate must be strictly darker
    # than every earlier one to be selected.
    candidates = [(255, 0, 0)]
    candidates.extend(
        lineSample(imm, column, slant)
        for column in range(int(width * .45), int(width * .55), 5)
        for slant in range(-50, 50))
    _, x_top, slant = min(candidates, key=lambda candidate: candidate[0])

    pen = ImageDraw.Draw(imm)
    pen.line((x_top, 0, x_top + slant, height), fill=(255, 0, 0))
    del pen
    return imm.rotate(atan2(-slant, height) / pi * 180, expand=True)
    
def test(imm):
    """Run paginaSplit() on ``imm`` and return its result.

    This function used to be a line-by-line copy of paginaSplit(); it now
    delegates to it so the two cannot drift apart.

    imm -- PIL image; it is modified in place by paginaSplit()
    """
    return paginaSplit(imm)

def elabora_comando(comando):
    """Split a command string into a list of lists of stripped fields.

    comando -- string whose entries are separated by ";" and whose fields,
               inside each entry, are separated by ","

    Returns one list per entry, each holding the entry's fields with the
    surrounding whitespace removed.
    """
    return [[campo.strip() for campo in voce.split(",")]
            for voce in comando.split(";")]
    
'''
ipotesi sintassi elabora_comando

unico parametro suddiviso da ;
singoli elementi costituiti da uno o più elementi separati da ,
elemento 1: nome file pdf oppure numero d'ordine nella lista pdf (digit)
elemento 2,3,4...: nome statement + valore statement

ipotesi:
"1; 2; 3" elabora i pdf 1,2,3
"1, elimina 1,no split; 2, elimina 1-3; 3"





'''
def main(comando):
    """Process every pdf named in the command string ``comando``.

    comando -- string parsed by elabora_comando(): entries separated by ";",
               fields separated by ",". The first field of an entry is the
               pdf file name or, when made of digits only, the index of its
               row in the module-level ``lista``. The field "no split"
               disables the page splitting for that entry. Other fields are
               not interpreted yet.

    For every entry the pdf is fetched, analysed, its images are extracted
    and processed, and the resulting pdf is written. The globals ``el_curr``
    and ``el_main`` are updated with the rows selected by index.
    Returns None.
    """
    global lista, el_main, el_curr
    comando = elabora_comando(comando)
    for i, voce in enumerate(comando):
        # The flag is evaluated for each entry, so a "no split" on one pdf
        # no longer carries over to the entries that follow it.
        split = "no split" not in voce
        if voce[0].isdigit():
            el_curr = lista[int(voce[0])].split("\t")
            if i == 0:
                el_main = el_curr[:]
            # Replace the index with the file name found in the row.
            voce[0] = el_curr[0]
        nome = voce[0]
        print(comando)

        print("## cerco o scarico il file")
        grab(nome)
        print("")
        print("## creo info")
        info = pdfinfo("pdf_orig/" + nome)

        print("## estraggo immagini")
        pdfimages("pdf_orig/" + nome)

        print("## elaboro immagini")
        cropJpg(info, split)

        print("##preparo pdf (in pdf_mod)")
        pdfOut(nome, split, i)
    return