Utente:Alex brollo/alignIt.py

Da Wikisource.
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
# text alignment program
# author : thomasv1 at gmx dot de
# licence : GPL
# copied from https://fr.wikisource.org/wiki/Utilisateur:Phe/Python/align.py (original also kept in Utente:Alex brollo/align.py)
# Requires: python 2.7, pywikibot, djvutxt
# Work in progress: adaptation to it.wikisource and enabling Match from pdf (will need pdftotext, from the xpdf library)
# For now meant to be launched from Idle (the if __name__ == "__main__" block is disabled)
# Entry point: do_align(pagename=None,target=None, djvuname=None, number=None) where:
#     pagename: name of the ns0 page to match
#     target: name of a utf-8 text file, already downloaded and copied into the script's folder, to match (alternative to pagename)
#     djvuname: name of the djvu or pdf file, which must be copied into the script's folder
#     number: number of the djvu page where the match starts


import difflib
import os
import re
import string
import subprocess
import sys
import urllib

# sys.path.append("../pywikipedia")
# import wikipedia, pagegenerators, catlib 
import pywikibot as bot  # replaces the old pywikipedia imports above


# Command used to launch DjVuLibre's djvutxt; the bare name assumes it is on PATH.
# Point it at a full path if needed, e.g. '/home/thomasv/djvulibre/tools/djvutxt'.
djvutxt = 'djvutxt'

def _read_page_text(filename, pagenum):
    """Return the text layer of page ``pagenum`` of ``filename``, decoded from UTF-8.

    The ``djvutxt`` command is started with an argument list rather than a
    shell command line, so a file name containing quotes, parentheses or other
    shell metacharacters reaches the tool unchanged.  If the command cannot be
    started, the problem is reported and an empty string is returned, so the
    caller ends up on its usual "no text layer" path.
    """
    command = [djvutxt, "--page=%d" % pagenum, filename]
    try:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError as err:
        print("cannot run %s: %s" % (djvutxt, err))
        return u""
    raw = proc.communicate()[0]
    return raw.decode('utf-8')


def _render_side(tokens, blocks, side, page_len, highlight):
    """Rebuild one side of an alignment, stopping at the end of the first page.

    tokens    -- words and separators alternating (re.split with a capturing
                 group), for the side being rendered
    blocks    -- result of SequenceMatcher.get_matching_blocks()
    side      -- 0 to walk the DjVu words, 1 to walk the target words
    page_len  -- number of words in the first DjVu page of the window
    highlight -- format string wrapped around every matched word

    Returns a ``(highlighted, plain)`` pair of unicode strings.
    """
    last = blocks[-2]  # last real block; blocks[-1] is the zero-length terminator
    highlighted = []
    plain = []
    for idx in range(last[side] + last[2]):
        matched = False
        past_page = False
        for blk in blocks:
            if blk[side] <= idx < blk[side] + blk[2]:
                matched = True
                # Index of the partner word on the DjVu side; once it lies
                # beyond the first page, the rest belongs to later pages.
                past_page = blk[0] + idx - blk[side] >= page_len
                break
        if past_page:
            break
        if 2 * idx + 1 >= len(tokens):
            # A final word with no separator after it is not emitted.
            continue
        word = tokens[2 * idx]
        separator = tokens[2 * idx + 1]
        plain.append(word + separator)
        highlighted.append((highlight % word if matched else word) + separator)
    return u"".join(highlighted), u"".join(plain)


def do_match(target, filename, djvuname, number, verbose=False):
    """Split ``target`` into per-page sections by aligning it with a DjVu text layer.

    For each page, starting at ``number``, the text of that page and the two
    following ones is compared word by word (difflib.SequenceMatcher) with the
    not yet consumed part of ``target``.  The part of ``target`` that lines up
    with the first page of the window is appended to the result under a
    ``==[[Page:<djvuname>/<page>]]==`` heading, and the next page is processed
    from there.

    target   -- unicode text to paginate
    filename -- path of the DjVu file handed to ``djvutxt``
    djvuname -- name used in the generated ``[[Page:...]]`` headings
    number   -- number of the first DjVu page to align
    verbose  -- when true, print both sides of every alignment with the
                matched words highlighted by ANSI colour codes

    Returns the paginated text; whatever could not be aligned follows a
    ``== reste ==`` heading.  Returns an empty string when nothing could be
    aligned at all.
    """
    matcher = difflib.SequenceMatcher()
    # Compiled once: the same two patterns are used on every page.
    word_splitter = re.compile(u'[\\W]+', re.U)
    token_splitter = re.compile(u'([\\W]+)', re.U)

    # Consecutive windows overlap by two pages, so keep extracted pages around
    # instead of running djvutxt three times for each of them.
    page_cache = {}

    def page_text(num):
        if num not in page_cache:
            page_cache[num] = _read_page_text(filename, num)
        return page_cache[num]

    offset = 0
    sections = []
    for step in range(1000):

        if step == 10 and offset == 0:
            print("no text layer")
            return u""

        pagenum = step + number
        first_page = page_text(pagenum)
        window = first_page + page_text(pagenum + 1) + page_text(pagenum + 2)
        page_cache.pop(pagenum, None)  # not needed again by later windows
        candidate = target[offset:offset + int(1.5 * len(window))]

        djvu_tokens = token_splitter.split(window)
        target_tokens = token_splitter.split(candidate)
        page_len = len(word_splitter.split(first_page))

        matcher.set_seqs(word_splitter.split(window),
                         word_splitter.split(candidate))
        blocks = matcher.get_matching_blocks()
        if len(blocks) < 2:  # only the terminator: no match at all
            print("LEN(MB) < 2, breaking")
            break
        ratio = matcher.ratio()
        print("%s %s %s" % (step, blocks[-2], ratio))

        if ratio < 0.1:
            print("low ratio")
            break

        if verbose:
            # The DjVu side is only rendered for display purposes.
            print(_render_side(djvu_tokens, blocks, 0, page_len,
                               u"\033[1;32m%s\033[0;49m")[0])
            print("--------------------------------")

        colored, plain = _render_side(target_tokens, blocks, 1, page_len,
                                      u"\033[1;31m%s\033[0;49m")
        if verbose:
            print(colored)
            print("====================================")

        sections.append(u"\n==[[Page:%s/%d]]==\n" % (djvuname, pagenum) + plain)
        offset = offset + len(plain)

    if offset == 0:
        # Headings may have been produced, but no text was ever consumed.
        return u""

    remainder = target[offset:]
    if remainder:
        sections.append(u"\n== reste ==\n" + remainder)
    return u"".join(sections)

def get_djvu_filename(filename):
    """Return ``filename`` with every space turned into an underscore."""
    pieces = filename.split(' ')
    return '_'.join(pieces)

def do_align(pagename=None,target=None, djvuname=None, number=None):
    """Align a text with a DjVu file and store the paginated result.

    pagename -- title of the it.wikisource page whose text is to be aligned
    target   -- path of a UTF-8 text file to align; used only when
                ``pagename`` is not given
    djvuname -- name of the DjVu file; spaces are turned into underscores to
                obtain the local file name
    number   -- number of the DjVu page where the alignment starts

    When the text came from a wiki page, the user is asked whether to upload
    the result to that page; if the answer is no, or the text came from a
    local file, the result is written to ``output.txt``.  Nothing is stored
    when the alignment produced no output.

    Raises ValueError when a required argument is missing.
    """
    if pagename is None and target is None:
        raise ValueError("do_align needs either pagename or target")
    if djvuname is None:
        raise ValueError("do_align needs djvuname")
    if number is None:
        # FIXME (original author): no default start page is implemented yet;
        # do_match cannot work without an integer page number.
        raise ValueError("do_align needs the starting page number")

    page = None  # stays None when the text comes from a local file
    if pagename is not None:
        site = bot.getSite('it',fam='wikisource')
        bot.setAction("pagination")
        page = bot.Page(site, pagename)
        target = page.get()
    else:
        with open(target) as source:
            target = unicode(source.read(),"utf-8")

    # A leading {{journal|...}} template is set aside and put back in front of
    # the paginated text afterwards.
    header = re.match(u'({{journal\\|[^}]*}}\n)', target)
    target = re.sub(u'{{journal\\|[^}]*}}\n', u'', target)

    # TODO: strip ns0 boilerplate (e.g. TextQuality / references wrappers)
    # from the target before matching.

    filename = get_djvu_filename(djvuname)
    output = do_match(target, filename, djvuname, number)
    if not output:
        # Alignment failed: stop here, so that a bare journal header is never
        # offered for upload in place of the page text.
        return
    if header:
        output = header.group(1) + output

    if page is not None:
        choice = bot.inputChoice(u'Upload ?', [ 'Yes', 'No' ], [ 'Y', 'N'], 'N')
        if choice in ('Y', 'y'):
            page.put(output)
            print("Salvato in pagina  %s" % page.title())
            return

    with open("output.txt","w") as out:
        out.write(output.encode("utf-8"))
    print("Salvato output.txt")
    return
##if __name__ == "__main__":
##    try:
##        page_number = None
##        if len(sys.argv) > 3:
##            page_number = int(sys.argv[3])
##        do_align(unicode(sys.argv[1], 'utf-8'), sys.argv[2], page_number)
##        #do_align(u"Etudes sur l'Angleterre : les classes inférieures", "Revue des Deux Mondes - 1845 - tome 11.djvu", 35)
##    finally:
##        bot.stopme()