Utente:Alfiobot/bot.py
(The formatting comes out badly because I use tabs at 4 spaces, while here they seem to be fixed at 8.)
#!/usr/bin/python
#
# Wrapper around the pywikipediabot framework
import os, re, time, sys
import wikipedia, catlib, pagegenerators
# Logs in to the Wikipedia server
def login( password):
    os.system("./login.py -pass:"+password)
# Logs out from the Wikipedia server
def logout():
    os.unlink("login-data/wikipedia-it-Alfiobot-login.data")
    print "Logged out"
# Sets the edit summary (action text) used for subsequent saves
def setaction(text):
    wikipedia.setAction(text)
# Returns a page object for the given page title
def getpage(title):
    return wikipedia.Page(wikipedia.getSite(), title)
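# A minimal usage sketch of the session helpers above (never called from the
# library itself; the password and page title are made up for illustration):
def _example_session():
    login("secret")
    setaction(u"Bot: test run")
    page = getpage(u"Utente:Alfiobot/Sandbox")
    wikipedia.output(page.title())
    logout()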
# Returns a list of links in the specified page, using Page objects
def list_links( title):
    site = wikipedia.getSite()
    return wikipedia.Page(site, title).linkedPages()
# Returns a list of pages in the specified category, using Page objects
def list_cat_pages( title):
    site = wikipedia.getSite()
    return catlib.Category(site, title).articles()
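# Usage sketch for the two listing helpers (hypothetical titles; both return
# lists of Page objects):
def _example_listing():
    for linked in list_links(u"Roma"):
        wikipedia.output(linked.title())
    for article in list_cat_pages(u"Categoria:Comuni italiani"):
        wikipedia.output(article.title())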
# Checks whether a page object is in the specified category.
# Returns True if so, False otherwise
def isincat( page, catTitle):
    if not isinstance(page, wikipedia.Page):
        page = getpage(page)
    site = wikipedia.getSite()
    cat_namespace = site.category_namespaces()[0]
    catTitle = catTitle[:1].upper() + catTitle[1:]
    cats = page.categories()
    catpl = wikipedia.Page(site, cat_namespace + ':' + catTitle)
    return catpl in cats
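# Usage sketch (hypothetical page and category titles; a plain title is
# converted to a Page object automatically):
def _example_isincat():
    if isincat(u"Roma", u"Comuni italiani"):
        wikipedia.output(u"Roma is already categorized")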
# Checks whether the specified category is surrounded by <noinclude> tags
def is_cat_noinclude( page, catTitle):
    if not isinstance(page, wikipedia.Page):
        page = getpage(page)
    text = page.get()
    return re.search("<noinclude>.*?\[\["+wikipedia.getSite().category_namespace()+"\:"+catTitle+"\]\].*?</noinclude>", text)
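# Usage sketch: templates normally keep their categories inside <noinclude>
# tags so that transcluding pages are not categorized too (hypothetical titles):
def _example_noinclude():
    if not is_cat_noinclude(u"Template:Comune", u"Template di servizio"):
        wikipedia.output(u"Category is not wrapped in <noinclude>")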
# Adds a category to the specified page object
# (copy and paste from category.py with small changes)
def addcat( page, catTitle, noinclude = False):
    if not isinstance(page, wikipedia.Page):
        page = getpage(page)
    site = wikipedia.getSite()
    cat_namespace = site.category_namespaces()[0]
    catTitle = catTitle[:1].upper() + catTitle[1:]
    try:
        cats = page.categories()
        catsWithSortKeys = page.categories(withSortKeys = True)
    except wikipedia.NoPage:
        wikipedia.output(u"%s doesn't exist yet. Ignoring." % (page.title()))
    except wikipedia.IsRedirectPage,arg:
        redirTarget = wikipedia.Page(site,arg.args[0])
        wikipedia.output(u"WARNING: %s is redirect to %s. Ignoring." % (page.title(), redirTarget.title()))
    else:
        wikipedia.output(u"Current categories:")
        for cat in cats:
            wikipedia.output(u"* %s" % cat.title())
        catpl = wikipedia.Page(site, cat_namespace + ':' + catTitle)
        if catpl in cats:
            wikipedia.output(u"%s is already in %s." % (page.title(), catpl.title()))
            return True
        else:
            wikipedia.output(u'Adding %s' % catpl.aslink())
            catsWithSortKeys.append(catpl)
            text = page.get()
            text = wikipedia.replaceCategoryLinks(text, catsWithSortKeys)
            if noinclude:
                cat_ns = wikipedia.getSite().category_namespace()
                text = text.replace("[["+cat_ns+":"+catTitle+"]]", "<noinclude>[["+cat_ns+":"+catTitle+"]]</noinclude>")
            page.put(text)
    return False
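# Usage sketch (hypothetical titles; noinclude=True is meant for templates):
def _example_addcat():
    setaction(u"Bot: adding category")
    addcat(u"Roma", u"Comuni italiani")
    addcat(u"Template:Comune", u"Template di servizio", noinclude = True)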
# Removes a category from an article, without touching any other text
def removecat( page, catTitle):
    replace( page, [('\[\['+wikipedia.getSite().category_namespace()+'\:'+catTitle+'.*?\]\]', '')])
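# Usage sketch (hypothetical titles):
def _example_removecat():
    setaction(u"Bot: removing category")
    removecat(u"Roma", u"Comuni italiani")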
# Saves a page object's contents back to the wiki. It only accepts Page
# objects, not titles: a bare title has no cached contents, so an empty
# page would be sent
def sendpage( page):
    page.put( page._contents )
# Appends text to an article
def append( page, text, verbose = False, send = True):
    replace( page, [("$", text)], verbose = verbose, send = send)
# Replaces text inside a page using regular expressions or text substitutions
# <replace_list> is a list of tuples, each with two members:
# - regular expression to match
# - replacement text
#
# Each tuple will be applied to the text, in order
#
# Returns True if some substitutions have been made, False otherwise
def replace( page, replace_list, use_regexp = True, verbose = False, send = True, interactive = False, exclude = None):
    # If a title was passed, get the corresponding page
    if not isinstance(page, wikipedia.Page):
        page = getpage(page)
    text = page.get()
    old_text = text
    # Temporarily tear out the excluded sections, so they are not touched
    if exclude:
        [text, removed] = tearout( text, exclude)
    for regexp, newtext in replace_list:
        if use_regexp:
            text = re.sub('(?u)'+regexp, newtext, text)
        else:
            text = text.replace( regexp, newtext)
    # Put the excluded sections back in place
    if exclude:
        text = placein( text, removed)
    if text != old_text:
        if interactive:
            wikipedia.output(u'>>> %s <<<' % page.title())
            wikipedia.showDiff(old_text, text)
            choice = wikipedia.inputChoice(u'Accept the changes?', ['Yes', 'No'], ['y', 'N'], 'N')
            if choice not in ['y', 'Y']:
                return False
        if verbose:
            log( u'Replacing text in ', onscreen = True, newline = False)
            log( page.aslink(), onscreen = True)
        if send:
            page.put(text)
        # Update local copy anyway
        page._contents = text
        return True
    else:
        wikipedia.output(u'No changes in page %s' % page.aslink())
        if verbose:
            log( u'No changes necessary in', onscreen = True, newline = False)
            log( page.aslink(), onscreen = True)
        return False
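# Usage sketch for replace(): each tuple is (pattern, replacement), applied in
# order; exclude= protects e.g. <nowiki> sections from the substitutions
# (hypothetical title and patterns):
def _example_replace():
    setaction(u"Bot: orthography")
    replace(u"Roma",
            [(u"perchè", u"perché")],
            exclude = [("<nowiki>", "</nowiki>")],
            interactive = True)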
# Removes any text matching a list of delimiter pairs, and places a unique marker in its place
# Returns two values: the modified text and a dictionary of marker/removed text pairs
def tearout( text, delimiters):
    removed = {}
    for start,end in delimiters:
        while True:
            pos1 = text.find(start)
            pos2 = text.find(end)
            if 0 <= pos1 < pos2:
                # Cut out the fragment, end delimiter included
                fragment = text[pos1:pos2+len(end)]
                marker = unique(text, "MARKER%dMARKER")
                text = text[:pos1] + marker + text[pos2+len(end):]
                removed[marker] = fragment
            else:
                break
    return [text, removed]
# Puts back the text removed by tearout()
def placein( text, removed):
    for marker, val in removed.items():
        text = text.replace(marker, val)
    return text
# Returns a unique identifier (one that is not already present in the string)
def unique( text, marker):
    counter = 0
    while True:
        m = marker % counter
        if m not in text:
            return m
        counter += 1
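# Usage sketch for the marker machinery: tear out protected sections, edit the
# rest, then put them back unchanged:
def _example_tearout():
    text = u"a <nowiki>[[a]]</nowiki>"
    [text, removed] = tearout(text, [("<nowiki>", "</nowiki>")])
    text = text.replace(u"a", u"b")    # the torn-out section is untouched
    return placein(text, removed)      # gives u"b <nowiki>[[a]]</nowiki>"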
# Open/close/write log
def openlog(task=''):
    global logfile
    logfile = file("logs/%s" % task, 'a')
    log( "\n"+task.upper()+" run started "+time.asctime(time.gmtime())+" (UTC)")
def closelog(task=''):
    log( task.upper()+" run ended "+time.asctime(time.gmtime())+" (UTC)")
    logfile.close()
def log(text='', newline = True, onscreen = False, page = None):
    if page:
        log( page.aslink()+': ', newline = False)
    try:
        logfile.write(text)
    except UnicodeEncodeError:
        logfile.write("(utf-8 error)")
    if onscreen:
        try:
            wikipedia.output(text)
        except UnicodeEncodeError:
            wikipedia.output(u'(utf-8 error)')
    if newline:
        logfile.write('\n')
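# Usage sketch: one append-mode log file per task under logs/ (hypothetical
# task name; the logs/ directory must already exist):
def _example_logging():
    openlog("categorie")
    log(u"processed one page", onscreen = True)
    closelog("categorie")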
# General exception handler: exits cleanly on a Ctrl-C, logs anything else
def handle_exceptions():
    exc_info = sys.exc_info()
    if exc_info[0] is KeyboardInterrupt:
        sys.exit()
    log( "Error: "+str(exc_info[0]))
# Starts a page generator and executes the given function on each page
def do_generator( generator, function, preload = False):
    if preload:
        generator2 = pagegenerators.PreloadingGenerator(generator, pageNumber = 100)
    else:
        generator2 = generator
    for page in generator2:
        try:
            page.get()
        except (wikipedia.IsRedirectPage, wikipedia.NoPage):
            print "Skipping redirect or missing page"
            continue
        function(page)
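# Usage sketch: apply a function to every article of a category, preloading
# pages in batches (hypothetical category name):
def _example_do_generator():
    cat = catlib.Category(wikipedia.getSite(), u"Categoria:Comuni italiani")
    gen = pagegenerators.CategorizedPageGenerator(cat)
    do_generator(gen, lambda page: addcat(page, u"Comuni italiani"), preload = True)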
class XmlDumpPageGenerator:
    """
    Generator which will yield pages from an XML dump file.
    The yielded object is a class with the following attributes:
      * title
      * id
      * text
      * timestamp
      * editRestriction
      * moveRestriction
    Arguments:
      * xmlfilename - The dump's path, either absolute or relative
    """
    def __init__(self, xmlfilename):
        self.xmlfilename = xmlfilename
    def __iter__(self):
        import xmlreader
        dump = xmlreader.XmlDump(self.xmlfilename)
        for entry in dump.parse():
            yield entry
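# Usage sketch: scan a local XML dump instead of hitting the live wiki
# (hypothetical dump filename):
def _example_dump():
    for entry in XmlDumpPageGenerator("itwiki-latest-pages-articles.xml"):
        if "{{Comune" in entry.text:
            wikipedia.output(entry.title)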