# Progetto:Bot/Programmi in Python per i bot/OCRfix.py
#
# From Wikisource.
#!/usr/bin/python
# -*- coding: utf-8  -*-


import os,shutil,sys,urllib,re,argparse



def dsedExtr(djvuBase):
    """Read ``<djvuBase>.dsed`` as UTF-8 and print its length in characters.

    Args:
        djvuBase: path of the file without the ``.dsed`` extension.

    Returns:
        None.
    """
    # Fixed: the path used to be built from an undefined name (NameError
    # on every call); it is now built from the djvuBase parameter.
    with open(djvuBase + ".dsed") as f:
        dsed = unicode(f.read(), "utf-8")
    print(len(dsed))
    return
def cleanup(dsed):
    """Rebuild para/line zones around the words of a flat dsed dump.

    ``dsed`` is a list of text lines (``search`` passes the dump split on
    newlines). The first line is copied as is. After that:

    * lines containing ``(char `` are dropped;
    * lines containing ``(word `` are wrapped in ``(para``/``(line`` zones
      whose boxes are the placeholder ``0 0 0 0``;
    * every other line is copied unchanged.

    Zone boundaries come from a literal backslash-n inside the quoted word
    text: right after the opening quote it starts a new line, right before
    the closing quote it ends the paragraph. These markers are removed from
    the output. The placeholder boxes are finally filled in by ``fixDsed``.

    Args:
        dsed: list of lines; it is not modified.

    Returns:
        The normalized dump as a single string.
    """
    # NOTE(review): a para/line still open when a new "(page " line arrives
    # is not closed here -- confirm the input always ends a page with a
    # paragraph-closing word.
    pezzi = [dsed[0]]
    openPara = False
    openLine = False
    for riga in dsed[1:]:
        if "(char " in riga:
            continue
        if "(word " not in riga:
            if "(page " in riga:
                # progress marker, one per page
                sys.stdout.write("* ")
            pezzi.append("\n" + riga)
            continue

        inizia_linea = r'"\n' in riga    # word text starts with \n
        chiude_para = r'\n"' in riga     # word text ends with \n

        if not openPara:
            pezzi.append("\n (para 0 0 0 0")
            openPara = True
        if not openLine:
            pezzi.append("\n  (line 0 0 0 0")
            openLine = True
        elif inizia_linea:
            # Close the running line and start the next one. Using elif
            # avoids emitting an empty "(line 0 0 0 0)" zone when the line
            # was opened just above for this same word.
            pezzi.append(")")
            pezzi.append("\n  (line 0 0 0 0")

        if chiude_para:
            # Work on a local copy so the caller's list is left untouched.
            riga += "))"
            openPara = False
            openLine = False
        pezzi.append("\n  " + riga.replace(r"\n", ""))

    return fixDsed("".join(pezzi))

def fixDsed(testo):
    """Fill in the placeholder ``0 0 0 0`` boxes of line and para zones.

    For each zone kind (lines first, then paragraphs) the text is cut at
    every ``(<kind> 0 0 0 0`` marker. Each piece, running from one marker
    to the next marker of the same kind, has its placeholder replaced by
    the box that ``maxWord`` computes for that piece.

    Args:
        testo: dsed text containing placeholder zones.

    Returns:
        The text with every placeholder box replaced.
    """
    for kind in ("line", "para"):
        placeholder = kind + " 0 0 0 0"
        marker = "(" + placeholder
        pezzi = testo.split(marker)
        risolti = [pezzi[0]]
        for coda in pezzi[1:]:
            zona = marker + coda
            box = maxWord(zona)
            risolti.append(zona.replace(placeholder, kind + " %d %d %d %d" % box))
        testo = "".join(risolti)
    return testo


def maxWord(testo):
    """Return the box enclosing every ``word`` zone found in ``testo``.

    Each ``word a b c d`` occurrence contributes four integers; the result
    is ``(min a, min b, max c, max d)``.
    NOTE(review): this presumes the djvused order xmin ymin xmax ymax.

    Args:
        testo: dsed text to scan.

    Returns:
        A 4-tuple of ints. When no word is found the historical sentinel
        ``(20000, 20000, 0, 0)`` is returned.
    """
    trovate = re.findall(r"word (\d+) (\d+) (\d+) (\d+)", testo)
    if not trovate:
        return (20000, 20000, 0, 0)
    # Take the real minima: seeding them with 20000 (as before) clipped the
    # result whenever every word lay beyond that coordinate.
    x0, y0, x1, y1 = zip(*[[int(n) for n in box] for box in trovate])
    return (min(x0), min(y0), max(x1), max(y1))
                          
           
def search(djvuBase):
    """Dump the text layer of ``<djvuBase>.djvu`` and list its strings.

    Steps:
      1. run djvused with ``output-txt`` and redirect it to
         ``<djvuBase>.dsed``;
      2. if the dump contains neither ``(para`` nor ``(line``, normalize it
         with ``cleanup`` and overwrite the ``.dsed`` file;
      3. collect every distinct quoted string that follows four integers,
         sort them, and write them one per row to both ``<djvuBase>.txt``
         and ``<djvuBase>_fix.txt``.

    Args:
        djvuBase: path of the DjVu file without extension.

    Returns:
        The string ``"fatto"``.
    """
    # NOTE(review): the command goes through the shell with djvuBase
    # interpolated; only use with trusted file names.
    command = 'djvu\\djvused -u "%s" -e output-txt > "%s"' % (djvuBase + ".djvu", djvuBase + ".dsed")
    print("Comando:  %s" % command)
    result = os.system(command)
    print("Risultato:  %s" % result)

    with open(djvuBase + ".dsed") as f:
        dsed = unicode(f.read(), "utf-8")

    # The original outer test checked "(line " twice; the effective rule
    # was (and still is): normalize only when no para and no line exist.
    if "(para" not in dsed and "(line" not in dsed:
        print("Normalizzo dsed")
        dsed = cleanup(dsed.split("\n"))
        print("Dsed normalizzato")
        with open(djvuBase + ".dsed", "w") as f:
            f.write(dsed.encode("utf-8"))

    regex = r'\d+ \d+ \d+ \d+ "(.+)"\)'
    # A set gives constant-time duplicate checks (the list scan was
    # quadratic in the number of distinct words).
    viste = set()
    for match in re.finditer(regex, dsed, re.MULTILINE):
        parola = match.group(1)
        if parola not in viste:
            viste.add(parola)
            if len(viste) % 5000 == 0:
                print("Memorizzate  %d  parole" % len(viste))

    testo = "\n".join(sorted(viste))
    # Same content twice: .txt stays as reference, _fix.txt is the copy
    # the user edits.
    for suffisso in (".txt", "_fix.txt"):
        with open(djvuBase + suffisso, "w") as f:
            f.write(testo.encode("utf-8"))
    return "fatto"

def diff(djvuBase):
    """Write the rows changed between ``.txt`` and ``_fix.txt``.

    ``<djvuBase>.txt`` and ``<djvuBase>_fix.txt`` are compared row by row.
    Each differing pair is written to ``<djvuBase>_diff.txt`` as
    ``original<TAB>corrected``. If the two files do not have the same
    number of rows a message is printed and nothing is written.

    Args:
        djvuBase: path of the DjVu file without extension.

    Returns:
        None.
    """
    def _righe(path):
        # Read one word list, closing the file deterministically.
        with open(path) as f:
            testo = unicode(f.read(), "utf-8")
        # Editors may add a UTF-8 byte-order mark or a final newline when
        # saving; neither is a word, so drop them before counting rows.
        return testo.lstrip(u"\ufeff").rstrip(u"\n").split(u"\n")

    fwrong = _righe(djvuBase + ".txt")
    fright = _righe(djvuBase + "_fix.txt")

    if len(fwrong) != len(fright):
        print("I file di correzione non corrispondono per numero di parole")
        return

    cambi = [w + "\t" + r for w, r in zip(fwrong, fright) if w != r]
    with open(djvuBase + "_diff.txt", "w") as f:
        f.write("\n".join(cambi).encode("utf-8"))
    return

def fixDjvu(djvuBase):
    """Apply the corrections in ``_diff.txt`` to the dsed and the DjVu.

    Every ``wrong<TAB>right`` row of ``<djvuBase>_diff.txt`` replaces each
    occurrence of ``"wrong"`` (quotes included) in ``<djvuBase>.dsed`` with
    ``"right"``. The file is rewritten and then passed to djvused with
    ``-f ... -s`` against ``<djvuBase>.djvu``.

    Args:
        djvuBase: path of the DjVu file without extension.

    Returns:
        None.
    """
    with open(djvuBase + "_diff.txt") as f:
        lista = unicode(f.read(), "utf-8").split("\n")
    with open(djvuBase + ".dsed") as f:
        dsed = unicode(f.read(), "utf-8")

    # NOTE(review): replacements run one after another, so a correction
    # whose result equals a later "wrong" word is replaced again.
    for riga in lista:
        parola = riga.split("\t")
        if len(parola) == 2 and parola[0]:
            dsed = dsed.replace('"' + parola[0] + '"', '"' + parola[1] + '"')

    with open(djvuBase + ".dsed", "w") as f:
        f.write(dsed.encode("utf-8"))

    # Paths are quoted, as in search(), so names with spaces survive the
    # shell.
    comando = 'djvused "%s.djvu" -f "%s.dsed" -s' % (djvuBase, djvuBase)
    result = os.system(comando)
    if result != 0:
        # Do not claim success when djvused reports a failure.
        print("djvused failed with status %s; %s was not updated" % (result, djvuBase + ".djvu"))
        return
    print("Edited words has been uploaded into %s" % (djvuBase + ".djvu"))
    return

def step1(djvuBase):
    """Run ``search`` and tell the user which file to edit next.

    Args:
        djvuBase: path of the DjVu file without extension.

    Returns:
        None.
    """
    search(djvuBase)
    istruzioni = '''Please open the file %s and edit it with a utf-8 text editor.
Don't change the number of rows!
Finally save the file and run step 2'''
    print(istruzioni % (djvuBase + "_fix.txt"))
    return

def step2(djvuBase):
    """Run ``diff`` and then ``fixDjvu`` on the same base name.

    Args:
        djvuBase: path of the DjVu file without extension.

    Returns:
        The string ``"Done"``.
    """
    for fase in (diff, fixDjvu):
        fase(djvuBase)
    return "Done"

def path2url(path):
    """Convert a local file system path into a ``file:`` URL.

    Args:
        path: local path.

    Returns:
        The URL obtained by joining ``file:`` with the URL-quoted path.
    """
    # urlparse was never imported by this file, so the original raised
    # NameError. Import locally; fall back to the Python 3 locations.
    try:
        from urlparse import urljoin
        from urllib import pathname2url
    except ImportError:
        from urllib.parse import urljoin
        from urllib.request import pathname2url
    return urljoin('file:', pathname2url(path))
        
def cleanfolder(dirpath):
    """Delete everything inside ``dirpath``, keeping ``dirpath`` itself.

    Each entry is first removed as a directory tree; if that raises
    OSError it is removed as a single file instead.

    Args:
        dirpath: directory to empty.

    Returns:
        None.
    """
    voci = [os.path.join(dirpath, nome) for nome in os.listdir(dirpath)]
    for voce in voci:
        try:
            shutil.rmtree(voce)
        except OSError:
            os.remove(voce)
    return



# Utilities
# New version, handles nested tags; x is the "non-specific" part of the
# opening tag (e.g. {{ when searching for {{Intestazione| )
def find_stringa(stringa,idi,idf,dc=0,x=None,side="left"):
    """Extract the text delimited by ``idi`` ... ``idf`` from ``stringa``.

    Args:
        stringa: text to search.
        idi: opening delimiter.
        idf: closing delimiter.
        dc: 0 returns the text between the delimiters; any other value
            returns it with both delimiters included.
        x: optional generic opening marker used to balance nested tags.
            The closing position is moved forward until ``x`` no longer
            occurs more often than ``idf`` inside the selected span.
        side: ``"right"`` starts from the last occurrence of ``idi``; any
            other value starts from the first one.

    Returns:
        The extracted text, or ``""`` when ``idi`` is absent or no
        (balancing) ``idf`` follows it.
    """
    if side == "right":
        inizio = stringa.rfind(idi)
    else:
        inizio = stringa.find(idi)
    if inizio == -1:
        return ""

    chiusura = stringa.find(idf, inizio + len(idi))
    if chiusura == -1:
        # Previously -1 + len(idf) was used as the end position, which
        # returned an unrelated slice for multi-character delimiters.
        return ""
    fine = chiusura + len(idf)

    if x is not None:
        while stringa[inizio:fine].count(x) > stringa[inizio:fine].count(idf):
            chiusura = stringa.find(idf, fine)
            if chiusura == -1:
                # Unbalanced nesting: no further closing delimiter. Bail
                # out rather than spin or return a broken slice.
                return ""
            fine = chiusura + len(idf)

    if dc == 0:
        return stringa[inizio + len(idi):fine - len(idf)]
    return stringa[inizio:fine]

def produci_lista(testo,idi,idf,dc=1,inizio=None):
    """Collect every ``idi`` ... ``idf`` element of ``testo`` in order.

    The first element (delimiters included) is located with
    ``find_stringa``, removed once from a working copy of the text, and
    the search repeats until ``find_stringa`` returns ``""``.

    Args:
        testo: text to scan.
        idi: opening delimiter.
        idf: closing delimiter.
        dc: 0 stores each element without its delimiters; any other value
            stores it with them.
        inizio: forwarded to ``find_stringa`` as its ``x`` argument.

    Returns:
        List of the elements found.
    """
    resto = testo[:]
    trovati = []
    while True:
        elemento = find_stringa(resto, idi, idf, 1, inizio)
        if elemento == "":
            break
        resto = resto.replace(elemento, "", 1)
        if dc == 0:
            elemento = find_stringa(elemento, idi, idf, 0, inizio)
        trovati.append(elemento)
    return trovati

def carica_pcl(nome_file, folder="dati/"):
    """Load the object pickled in ``folder + nome_file + ".pcl"``.

    Args:
        nome_file: file name without the ``.pcl`` extension.
        folder: prefix prepended to the file name as is (no separator is
            added).

    Returns:
        The unpickled object.
    """
    # pickle is not imported at the top of the file, so the original
    # raised NameError; import it here.
    import pickle
    nome_file = folder + nome_file + ".pcl"
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Binary mode is what the pickle documentation requires.
    with open(nome_file, "rb") as f:
        return pickle.load(f)

def salva_pcl(variabile,nome_file="dato",folder="dati/"):
    """Pickle ``variabile`` into ``folder + nome_file + ".pcl"``.

    Args:
        variabile: object to store.
        nome_file: file name without the ``.pcl`` extension.
        folder: prefix prepended to the file name as is (no separator is
            added).

    Returns:
        None. A confirmation line with the file name is printed.
    """
    # pickle is not imported at the top of the file, so the original
    # raised NameError; import it here.
    import pickle
    nome_file = folder + nome_file + ".pcl"
    # Binary mode is what the pickle documentation requires; the with
    # block closes the file even if dump raises.
    with open(nome_file, "wb") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)
    return

def main(djvuBase, step):
    """Print the parameters and run the requested step.

    Args:
        djvuBase: path of the DjVu file without extension.
        step: ``"step1"`` runs ``step1``, ``"step2"`` runs ``step2``; any
            other value runs nothing.

    Returns:
        None.
    """
    print("Parametri:  %s %s" % (djvuBase, step))
    for nome, azione in (("step1", step1), ("step2", step2)):
        if step == nome:
            azione(djvuBase)

if __name__ == "__main__":
    # Command-line entry point: <script> <djvuBase> <step1|step2>
    parser = argparse.ArgumentParser(description="Scarica, modifica e ricarica l'elenco delle parole di un file djvu")

    parser.add_argument('djvuBase', help="nome base del file djvu")

    # Restrict the action to the two values main() handles, so a mistyped
    # step is rejected by argparse instead of silently doing nothing.
    parser.add_argument('step', choices=['step1', 'step2'], help='azione (step1|step2)')

    args = parser.parse_args()

    main(args.djvuBase, args.step)