Progetto:Bot/Programmi in Python per i bot/djvutext it.py

Da Wikisource.

Nota: lo script funziona solo se le routine dell'applicazione DjVuLibre sono installate sullo stesso PC e rese accessibili mediante modifica del PATH di sistema.

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This bot uploads text from djvu files onto pages in the "Page"
namespace.  It is intended to be used for Wikisource.

The following parameters are supported:

    -dry           If given, doesn't do any real changes, but only shows
                   what would have been changed.
    -ask           Ask for confirmation before uploading each page.
                   (Default: ask when overwriting pages)
    -djvu:...      Filename of the djvu file
    -index:...     Name of the index page
                   (Default: the djvu filename)
    -pages:<start>-<end> Page range to upload; <end> is optional

Any other parameter is reported as an unknown argument and ignored.

NOTE: modified for it.wikisource, where the Page namespace is 108.
"""
#
# (C) Pywikipedia bot team, 2008-2010
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: djvutext.py 8051 2010-04-04 15:33:15Z mfarag $'
import wikipedia
import os, sys
import config, codecs

# This is required for the text that is shown when you run this script
# with the parameter -help.
# It is empty here: this script defines no replacements of its own.
docuReplacements = {
}

class DjVuTextBot:
    """Upload the text layer of a DjVu file to "Page" namespace pages.

    The text is extracted one page at a time by running the DjVuLibre
    command-line tools (``djvused``, ``djvudump`` and ``djvutxt``) through
    the shell, so those tools must be reachable through the system PATH.
    Extracted text is exchanged through a scratch file named
    ``<djvu filename>.out``.
    """

    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'ar': u'روبوت: إنشاء صفحة بنص مستخرج من DjVu',
        'en': u'Robot: creating page with text extracted from DjVu',
        'fr': u'Bot: Creating page with texte extracted from DjVu',
        'nl': u'Bot: pagina aangemaakt met tekst geëxtraheerd uit DjVu-bestand',
        'pt': u'Bot: criando página com texto extraído do DjVu',
    }
    # On English Wikisource, {{blank page}} is used to track blank pages.
    # It may be omitted by adding an empty string like has been done for 'fr'.
    blank = {
        'en': u'{{blank page}}',
        'fr': u'',
        'pt': u'',
        'it': u''
    }

    def __init__(self, djvu, index, pages, ask=False, debug=False):
        """
        Constructor. Parameters:
        djvu : filename of the DjVu file
        index : name of the index page
        pages : page range as "<start>-<end>", "<start>-" or "<page>";
                a false value means every page of the file
        ask : ask for confirmation before saving each page
        debug : dry run; show the changes but never save them
        """
        self.djvu = djvu
        self.index = index
        self.pages = pages
        self.dry = debug
        self.ask = ask

    @staticmethod
    def _encode_command(cmd):
        """Encode a unicode command line before handing it to the shell.

        sys.stdout.encoding is None when the output is redirected to a file
        or a pipe; encoding with None raises TypeError, so fall back to the
        filesystem encoding and finally to the interpreter default.
        """
        encoding = (sys.stdout.encoding
                    or sys.getfilesystemencoding()
                    or sys.getdefaultencoding())
        return cmd.encode(encoding)

    def _read_output(self):
        """Return the decoded content of the scratch file "<djvu>.out"."""
        f = codecs.open(u"%s.out" % self.djvu, 'r',
                        config.textfile_encoding, 'replace')
        try:
            return f.read()
        finally:
            # Close the file even if reading or decoding fails.
            f.close()

    def NoOfImages(self):
        """Return the number of pages reported by ``djvused -e 'n'``.

        Raises ValueError if the command prints nothing that parses as an
        integer.
        """
        cmd = u"djvused -e 'n' \"%s\"" % (self.djvu)
        pipe = os.popen(self._encode_command(cmd))
        try:
            first_line = pipe.readline()
        finally:
            pipe.close()
        count = int(first_line.rstrip())
        wikipedia.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the list of page numbers to process.

        Without a page range every page from 1 to NoOfImages() is returned.
        "<start>-<end>" selects an inclusive range, "<start>-" runs from
        <start> to the last page and a bare "<page>" selects one page.
        """
        start = 1
        end = self.NoOfImages()

        if self.pages:
            first, dash, last = self.pages.partition('-')
            start = int(first)
            if not dash:
                # A single page number was given.
                end = start
            elif last:
                end = int(last)
        wikipedia.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end+1)

    def run(self):
        """Set the edit summary, work out the title prefix and treat pages."""
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))

        linkingPage = wikipedia.Page(wikipedia.getSite(), self.index)
        self.prefix = linkingPage.titleWithoutNamespace()
        if self.prefix.startswith('Liber:'):
            self.prefix = self.prefix[6:]
        wikipedia.output(u"Using prefix %s" % self.prefix)
        gen = self.PagesGenerator()

        site = wikipedia.getSite()
        # The bot account name is embedded in the page header by treat().
        self.username = config.usernames[site.family.name][site.lang]

        for pageno in gen:
            wikipedia.output("Processing page %d" % pageno)
            self.treat(pageno)

    def has_text(self):
        """Return True if the ``djvudump`` output mentions a 'TXTz' chunk."""
        cmd = u"djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu)
        os.system(self._encode_command(cmd))
        return 'TXTz' in self._read_output()

    def get_page(self, pageno):
        """Return the text that ``djvutxt`` extracts for page `pageno`."""
        wikipedia.output(u"fetching page %d" % pageno)
        cmd = u"djvutxt --page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu)
        os.system(self._encode_command(cmd))
        return self._read_output()

    @staticmethod
    def elabora_it(text):
        """Post-process extracted text for it.wikisource.

        At the moment the text is returned unchanged.
        """
        return text

    def treat(self, pageno):
        """
        Builds the wikitext for the given page number, shows the diff
        against the current page content and saves the page when allowed.
        """
        site = wikipedia.getSite()
        # IMPORTANT: essential change from the original script. The Page
        # namespace is 108 on it.wikisource (it is usually 104).
        page_namespace = site.family.namespaces[108][site.lang]
        page = wikipedia.Page(site, u'%s:%s/%d' % (page_namespace, self.prefix, pageno) )
        exists = page.exists()

        djvutxt = self.get_page(pageno)

        if not djvutxt:
            djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
        else:
            # Must go through self: a bare elabora_it is not a global name.
            djvutxt = self.elabora_it(djvutxt)
        text = u'<noinclude>{{PageQuality|1|%s}}<div class="pagetext">\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,djvutxt)

        # convert to wikisyntax
        # this adds a second line feed, which makes a new paragraph
        # The marker has to be a unicode literal: on Python 2 a non-ASCII
        # byte string passed to unicode.replace raises UnicodeDecodeError.
        text = text.replace(u'¬', u"\n")

        # only save if something was changed
        # automatically ask if overwriting an existing page
        ask = self.ask
        if exists:
            ask = True
            old_text = page.get()
            if old_text == text:
                wikipedia.output(u"No changes were needed on %s" % page.aslink())
                return
        else:
            old_text = ''

        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        wikipedia.showDiff(old_text, text)

        if self.dry:
            wikipedia.inputChoice(u'Dry mode... Press enter to continue', [], [], 'dummy')
            return

        if ask:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        else:
            choice = 'y'
        if choice == 'y':
            try:
                # Save the page
                page.put_async(text)
            except wikipedia.LockedPage:
                wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
            except wikipedia.SpamfilterError as error:
                wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))

def main():
    """Read the command-line options, check the inputs and start the bot.

    Help is shown instead when no DjVu file is given or when no index name
    can be determined.
    """
    import os.path

    # (argument prefix, option name, True when the option carries no value)
    known_options = (
        ("-dry", 'dry', True),
        ("-ask", 'ask', True),
        ("-djvu:", 'djvu', False),
        ("-index:", 'index', False),
        ("-pages:", 'pages', False),
    )
    options = {'dry': False, 'ask': False,
               'djvu': None, 'index': None, 'pages': None}

    # Parse command line arguments
    for arg in wikipedia.handleArgs():
        for prefix, key, is_flag in known_options:
            if arg.startswith(prefix):
                if is_flag:
                    options[key] = True
                else:
                    options[key] = arg[len(prefix):]
                break
        else:
            wikipedia.output(u"Unknown argument %s" % arg)

    djvu = options['djvu']
    index = options['index']

    if djvu:
        # os.stat raises if the djvu file cannot be found
        os.stat(djvu)
        # The index name defaults to the djvu filename
        index = index or os.path.basename(djvu)

    if not (djvu and index):
        wikipedia.showHelp()
        return

    site = wikipedia.getSite()
    index_page = wikipedia.Page(site, index)

    if site.family.name != 'wikisource':
        raise wikipedia.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)

    if not index_page.exists():
        if index_page.namespace() == 0:
            # Retry with the namespace name stored on the wiki in front
            index_namespace = wikipedia.Page(site, 'MediaWiki:Proofreadpage index namespace').get()
            index_page = wikipedia.Page(wikipedia.getSite(),
                                        u"%s:%s" % (index_namespace, index))

    if not index_page.exists():
        raise wikipedia.NoPage(u"Page '%s' does not exist" % index)

    wikipedia.output(u"uploading text from %s to %s" % (djvu, index_page.aslink()) )

    bot = DjVuTextBot(djvu, index, options['pages'], options['ask'], options['dry'])
    if not bot.has_text():
        raise ValueError("No text layer in djvu file")

    bot.run()

# Run the bot only when this file is executed as a script.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always call wikipedia.stopme(), even when main() raised.
        wikipedia.stopme()