# Progetto:Bot/Programmi in Python per i bot/xml2dsed.py
#
# From Wikisource ("Da Wikisource.").
# (Web page navigation residue: "Jump to navigation Jump to search")
#!/usr/bin/python
# -*- coding: utf-8  -*-



from bs4 import BeautifulSoup
import lxml, pickle, re,sys
from os import system,listdir,getcwd,path

# Module-level state shared by the functions below.
# No-op string expression kept from the original source.
'{{}}'
# Pairs of digit strings captured by dump() from the "DjVu <n>x<n>" lines
# of the djvudump output, in the order they appear.
imgSize=[]
# Identifier used as prefix of the "<IAid>_djvu.xml" file; set by leggi().
IAid=""
# Name of the DjVu file being processed; set by leggi(), read by dsed().
# Initialised here like the other globals so that reading it before
# leggi() has run does not raise NameError.
djvuName=""
# List of (offset, length) pairs, one per <OBJECT> element of the XML file;
# set by leggi(), read by pagina() and dsed().
p=[]

def leggi(iaid, djvu):
    """Prepare the module state for the pair (XML file, DjVu file).

    Stores iaid and djvu in the globals IAid and djvuName, runs dump() on
    the DjVu file, then reads "<iaid>_djvu.xml", removes every WORD element
    whose coords start with "0,0,0,0", indexes the <OBJECT>...</OBJECT>
    elements into the global p and pickles that index with salva_pcl().

    If the XML file does not exist a message is printed and p is left
    untouched.
    """
    global IAid, djvuName, p
    IAid=iaid
    djvuName=djvu
    dump(djvu)
    xmlName=iaid+"_djvu.xml"
    if not path.isfile(xmlName):
        print("nessun file utilizzabile")
        return
    with open(xmlName,"rb") as f:
        xml=f.read()
    marker='<WORD coords="0,0,0,0'
    changed=False
    while marker in xml:
        errore=find_stringa(xml,marker,"</WORD>",1)
        if errore=="":
            # The marker has no usable closing tag: nothing can be removed,
            # so stop instead of looping forever on an unchanged string.
            break
        print(errore)
        xml=xml.replace(errore,"")
        changed=True
    if changed:
        # Write the cleaned text back once, so that the byte offsets
        # computed below match the file that pagina() will read.
        with open(xmlName,"wb") as f:
            f.write(xml)
    p=coord(xml,"<OBJECT","</OBJECT>")
    salva_pcl(p,iaid,"")
    return


def dump(djvuFile):
    """Run djvudump on djvuFile and collect the page sizes into imgSize.

    The command output is written to "dump.txt"; every "DjVu <n>x<n>"
    occurrence in it becomes one pair of digit strings in the global
    imgSize. If the command returns a non-zero status a message is
    printed and imgSize is left unchanged.
    """
    global imgSize
    # NOTE(review): the file name is interpolated unquoted into a shell
    # command, so names containing spaces or shell metacharacters will not
    # work as-is.
    command="djvudump -o dump.txt %s" % (djvuFile)
    result=system(command)
    if result!=0:
        print("Errore in dump()")
        return
    with open("dump.txt") as f:
        d=f.read()
    imgSize=re.findall(r"DjVu (\d+)x(\d+)",d)
    return
    


# Extract one page (as a soup object) from the djvu.xml file read by leggi().
def pagina(n):
    """Return the n-th indexed element of the XML file as a BeautifulSoup.

    p[n] gives the byte offset and length of the element inside
    "<IAid>_djvu.xml"; that slice is decoded as UTF-8 and parsed with the
    "xml" parser. Raises IndexError when n is outside the index built by
    leggi().
    """
    offset,length=p[n]
    # The context manager closes the file even if seek/read fails.
    with open(IAid+"_djvu.xml","rb") as f:
        f.seek(offset)
        testo=f.read(length)
    return BeautifulSoup(testo.decode("utf-8"),"xml")


# estrae il testo di una pagina, oggetto soup ottenuto con pagina()
def testoPag(soupPage):
    testo=""
    par=soupPage.find_all("PARAGRAPH")
    for i in par:
        lin=i.find_all("LINE")
        for l in lin:
            word=l.find_all("WORD")
            tl=[]
            for w in word:
                tl.append(w.get_text())
            tl=" ".join(tl)
            tl=tl.strip()
            testo+=tl+"\n"
        testo+="\n"
    return testo




def x2d(n):
    """Build the nested s-expression text for page n.

    The page is loaded with pagina(n); its PAGECOLUMN, REGION, PARAGRAPH,
    LINE and WORD elements are written as nested "(column ...)",
    "(region ...)", "(para ...)", "(line ...)" and "(word ...)" forms
    inside one "(page ...)" form. Each container is first written with the
    placeholder "####", which is later replaced by rect() applied to the
    boxes of the elements one level below it.

    Returns "" when the parsed page contains no OBJECT element.

    NOTE(review): every replace("####", ...) runs over the whole text
    accumulated so far, word texts included, so a word containing "####"
    would be rewritten as well.
    """
    testo=""
    pag=pagina(n)
    if len(pag.find_all("OBJECT"))==0:
        return testo
    global imgSize
        
        
    testo="(page #### "
    # width is read here but not used in the rest of the function
    width=int(pag.OBJECT["width"])
    height=int(pag.OBJECT["height"])
    # columns, regions and paragraphs that contain no WORD are skipped
    for col in pag.find_all("PAGECOLUMN"):
        if len(col.find_all("WORD"))>0:
            testocol="\n (column #### "
            for reg in col.find_all("REGION"):
                if len(reg.find_all("WORD"))>0:
                    testoreg="\n  (region #### "
                    for par in reg.find_all("PARAGRAPH"):
                        if len(par.find_all("WORD"))>0:

                            testopar="\n   (para #### "
                            # lines are not filtered: a LINE without WORD gets
                            # the box "0 0 0 0" that rect() returns for []
                            for lin in par.find_all("LINE"):
                                testolinea="\n    (line #### "
                                # running bounding box handed to and returned by
                                # xy(); its final value is not read afterwards
                                maxcoord="100000 100000 0 0"
                                for word in lin.find_all("WORD"):
                                    # rescale from the XML page height to the
                                    # second value of imgSize[n]
                                    coord,maxcoord=xy(word["coords"],height,maxcoord,hpage=int(imgSize[n][1]))
                                    testoWord=replacer(word.get_text()) # backslashes and double quotes are rewritten by replacer()

                                    testolinea+='\n     (word %s "%s")' % (coord,testoWord)
                                # line box = union of the word boxes found in this line
                                testolinea=testolinea.replace("####",rect(re.findall("word (\d+ \d+ \d+ \d+)",testolinea)))
                                testolinea+=")"
                                testopar+=testolinea
                            # paragraph box = union of its line boxes
                            testopar=testopar.replace("####",rect(re.findall("line (\d+ \d+ \d+ \d+)",testopar)))                       
                            testopar+=")"
                            testoreg+=testopar
                    # region box = union of its paragraph boxes
                    testoreg=testoreg.replace("####",rect(re.findall("para (\d+ \d+ \d+ \d+)",testoreg)))                            
                    testoreg+=")"
                    testocol+=testoreg
            # column box = union of its region boxes
            testocol=testocol.replace("####",rect(re.findall("region (\d+ \d+ \d+ \d+)",testocol)))
            testocol+=")"
            testo+=testocol
    # page box = union of the column boxes
    testo=testo.replace("####",rect(re.findall("column (\d+ \d+ \d+ \d+)",testo)))
    testo+=")"
    # drop columns that ended up with no content
    testo=testo.replace("\n (column #### )","").replace("\n (column 0 0 0 0 )","")
    return testo

def dsed(ini=None,fin=None):
    """Build the djvused script for pages ini..fin-1 and apply it.

    ini and fin are 0-based indexes into the page index p; they default to
    0 and len(p). The script starts with "select" / "remove-txt" and then
    holds one "select <page>" / "set-txt" section for every page whose
    x2d() text is not empty. It is written, UTF-8 encoded, to both
    "output.dsed" and "<IAid>.dsed", then passed to djvused for the file
    djvuName; when djvused returns 0, djvuFix() is run on the same file.
    """
    if ini is None:
        ini=0
    if fin is None:
        fin=len(p)
    parts=["select\nremove-txt\n"]
    for i in range(ini,fin):
        testo=x2d(i)
        if testo!="":
            # djvused numbers pages from 1
            testo="select %d\nset-txt\n" % (i+1)+testo+"\n.\n"
            parts.append(testo)
        print("pagina:  %d bytes: %d" % (i+1,len(testo)))
    data="".join(parts).encode("utf-8")
    for name in ("output.dsed","%s.dsed" % (IAid)):
        with open(name,"w") as f:
            f.write(data)
    # NOTE(review): djvuName is interpolated unquoted into a shell command.
    result=system('djvused %s -f output.dsed -s' % djvuName)
    if result==0:
        print("dsed caricato")
        djvuFix(djvuName)
    return
        
        
def replacer(testo):
    """Apply the fixed list of character substitutions to a word.

    The pairs are applied one after the other, in the order listed, so the
    apostrophe is converted before double quotes are turned into
    apostrophes.
    """
    substitutions=(
        ("\\",";"),
        ("'",u"\u2019"),
        ('"',r"'"),
        (u"\xf9",u"\xfa"),
        (u"\xec",u"\xed"),
        ("cosi",u"cos\xed"),
        (u"\xe0",u"\xe1"),
    )
    for old,new in substitutions:
        testo=testo.replace(old,new)
    return testo




def rect(lista):
    """Return the bounding box of a list of "a b c d" box strings.

    Each item holds four space-separated integers. The result is a string
    with the minimum of the first two fields and the maximum of the last
    two, or "0 0 0 0" when the list is empty. The input list is left
    unmodified.
    """
    if not lista:
        return "0 0 0 0"
    # Parse into a new structure instead of overwriting the caller's items.
    boxes=[[int(v) for v in item.split(" ")] for item in lista]
    return "%d %d %d %d" % (
        min(b[0] for b in boxes),
        min(b[1] for b in boxes),
        max(b[2] for b in boxes),
        max(b[3] for b in boxes),
    )


def xy(coords,height,maxcoord,hpage):
    """Rescale one "a,b,c,d" box and update a running bounding box.

    The first four comma-separated fields of coords are scaled by
    hpage/height; the second and fourth are first replaced by
    height minus their value. maxcoord is a space-separated string whose
    first two fields are lowered, and whose next two fields are raised, to
    include the scaled box.

    Returns a tuple (scaled box, updated maxcoord), both space-separated
    strings.
    """
    scale=hpage*1.0/height
    raw=coords.split(",")[:4]
    box=[
        int(int(raw[0])*scale),
        int((height-int(raw[1]))*scale),
        int(int(raw[2])*scale),
        int((height-int(raw[3]))*scale),
    ]

    limits=maxcoord.split(" ")
    # first two fields track minima, the next two track maxima
    for k,pick in enumerate((min,min,max,max)):
        limits[k]=str(pick(int(limits[k]),box[k]))

    return (" ".join([str(v) for v in box])," ".join(limits))
    
                        
    
def childr(el):
    for i in el.children:
        if i.name != None:
            print i.name,
            if "coords" in i.attrs:
                print i["coords"], i.get_text()
            print
            childr(i)
    return

def djvuFix(djvu):
    """Remove the text layer of pages whose text cannot be extracted.

    Runs "djvutxt -detail=page" on djvu, writing to "test.txt", and scans
    the output: lines starting with "(page ", "()" or "failed" each advance
    the page counter, and the pages at "failed" lines are collected. For
    each collected page, "output-txt" is attempted with djvused; when that
    command returns 10 the page text is removed and the file saved.
    """
    command="djvutxt %s -detail=page test.txt" % (djvu)
    result=system(command)
    if result!=0:
        print("errore in djvutxt")
        return
    with open("test.txt") as f:
        righe=f.read().split("\n")
    n=1
    l=[]
    for riga in righe:
        if riga.startswith("failed"):
            l.append(n)
            n+=1
        elif riga.startswith(("(page ","()")):
            n+=1
    print(l)
    for page in l:
        result=system('djvused %s -e "select %d; output-txt">dummy.txt' % (djvu,page))
        # NOTE(review): the value returned by os.system is platform
        # dependent; confirm that 10 is what the target platform reports.
        if result==10:
            system('djvused %s -e "select %d; remove-txt; save"' % (djvu,page))
            # The original had this message as a bare expression, so it was
            # never shown.
            print("Testo pagina %d corrotto, viene cancellato" % page)
    return
    


    
    
# Find offset and length of every s1...s2 delimited element of f
# (leggi() uses it for the <OBJECT ...>...</OBJECT> elements of the XML).
def coord(f,s1,s2):
    """Return a list of (offset, length) for each s1...s2 span in f.

    offset is the position of s1 and length reaches the end of the first
    s2 found at or after it. An opening marker without a closing marker
    ends the scan and is not reported.
    """
    l=[]
    d1=f.find(s1)
    while d1!=-1:
        d2=f.find(s2,d1)
        if d2==-1:
            # unterminated element: no valid length can be computed
            break
        end=d2+len(s2)
        l.append((d1,end-d1))
        # resume after the closing marker, always making progress
        d1=f.find(s1,max(end,d1+1))
    return l


        
### pickle utilities 

def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in folder + nome_file + ".pcl".

    NOTE: unpickling can execute arbitrary code; only load files written
    by salva_pcl() or otherwise trusted.
    """
    nome_file=folder+nome_file+".pcl"
    # The context manager closes the file even if unpickling fails.
    with open(nome_file) as f:
        return pickle.load(f)

def salva_pcl(variabile,nome_file="dato",folder="dati/"):
    """Pickle variabile into folder + nome_file + ".pcl" and report it.

    The target path is built by plain string concatenation, so folder must
    already end with a path separator (or be empty for the current
    directory).
    """
    nome_file=folder+nome_file+".pcl"
    # The context manager closes the file even if pickling fails.
    with open(nome_file,"w") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file "+nome_file)
    return



def find_stringa(stringa,idi,idf,dc=0,x=None,side="left"):
    """Return the text found between the markers idi and idf in stringa.

    stringa -- text to search
    idi     -- opening marker; the first occurrence is used, or the last
               one when side == "right"
    idf     -- closing marker, searched after the opening marker
    dc      -- 0: return only the text between the markers;
               anything else: return the markers as well
    x       -- optional nested-opening marker: while the candidate span
               holds more occurrences of x than of idf, the span is
               extended to the next idf
    side    -- "right" to start from the last occurrence of idi

    Returns "" when idi is missing, when idf is missing after it, or when
    the span cannot be extended far enough to balance x.
    """
    if side=="right":
        idip=stringa.rfind(idi)
    else:
        idip=stringa.find(idi)
    if idip==-1:
        return ""
    end=stringa.find(idf,idip+len(idi))
    if end==-1:
        # No closing marker: the original sliced with a meaningless end
        # position here and could return unrelated text.
        return ""
    idfp=end+len(idf)
    if x is not None:
        while stringa[idip:idfp].count(x)>stringa[idip:idfp].count(idf):
            end=stringa.find(idf,idfp)
            if end==-1:
                return ""
            idfp=end+len(idf)
    if dc==0:
        return stringa[idip+len(idi):idfp-len(idf)]
    return stringa[idip:idfp]

def produci_lista(testo,idi,idf,dc=1,inizio=None):
    """Collect every idi...idf span of testo, in order of appearance.

    Spans are located with find_stringa() (inizio is passed as its nested
    marker) and removed from a working copy one at a time until none is
    left. With dc == 0 the markers are stripped from each collected item;
    otherwise they are kept.
    """
    remaining=testo
    lista=[]
    while True:
        el=find_stringa(remaining,idi,idf,1,inizio)
        if el=="":
            break
        remaining=remaining.replace(el,"",1)
        if dc==0:
            el=find_stringa(el,idi,idf,0,inizio)
        lista.append(el)
    return lista



def main(params):
    """Run the whole conversion for a command-line style argument list.

    params[1] is the identifier passed to leggi() as iaid and params[2]
    the DjVu file name; params[0] is ignored. When fewer than three items
    are given, a usage message is printed and nothing is done.
    """
    if len(params)<3:
        print("Uso: xml2dsed.py <IAid> <file.djvu>")
        return
    leggi(params[1],params[2])
    dsed()
    return

if __name__ == "__main__":
    # Hand the raw argument vector (script name included) to main().
    main(sys.argv)