From Botwiki
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This is a simple script (if you have just started programming in Python,
I suggest you take a look at it) that saves the HTML code of a web page
to a .txt file or prints it to the console (DOS screen).
It can be useful if you need to extract parameters or information from a log
or a similar Wikipedia page.
You can use the following parameters:
-url To set which URL to get (default: http://en.wikipedia.org/wiki/Main_Page)
-print To print the output to the console (DOS screen) instead of
writing it to a .txt file (print.txt) (default: save to the file)
This script is taken from: http://botwiki.sno.cc/
"""
#
#
# (C) Filnik, 2007
#
# Distributed under the terms of the MIT license.
#
# Version: 1.0
#
import wikipedia, urllib2
import time, config
# Default parameters; both can be overridden from the command line.
url = "http://en.wikipedia.org/wiki/Main_Page"
printme = False

# Handle the arguments yielded by wikipedia.handleArgs():
#   -url          ask for the address through wikipedia.input()
#   -url:ADDRESS  use ADDRESS (everything after the fifth character)
#   -print        show the page on the console instead of saving it
for arg in wikipedia.handleArgs():
    if arg.startswith('-url'):
        if len(arg) == 4:
            # Bare '-url' with no address attached: prompt for one.
            url = str(wikipedia.input(u'What url do you need to get?'))
        else:
            # Skip the '-url' prefix plus the one separator character.
            url = str(arg[5:])
    elif arg == '-print':
        printme = True

# The URL needs an explicit scheme before it is requested. Only the *start*
# of the URL is inspected: a scheme that merely appears later (for example
# inside a query string) must not suppress the prefix, and an https:// URL
# must not get a second scheme glued in front of it.
lowered = url.lower()
if not (lowered.startswith('http://') or lowered.startswith('https://')):
    url = 'http://' + url
# Browser-like User-Agent header sent with every request.
_USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) '
               'Gecko/20050915 Firefox/1.0.7')


def _fetch(url):
    """Download url once and return its body decoded to text.

    The body is decoded with config.console_encoding. Errors raised while
    opening, reading or decoding are not handled here; the response is
    closed in every case once it has been opened.
    """
    request = urllib2.Request(url)
    request.add_header("User-Agent", _USER_AGENT)
    response = urllib2.urlopen(request)
    try:
        return response.read().decode(config.console_encoding)
    finally:
        # Close the connection even if read() or decode() fails.
        response.close()


def pageText(url):
    """Return the decoded content of url, retrying once on an HTTP error.

    If the first attempt raises urllib2.HTTPError, a notice is written with
    wikipedia.output(), the function sleeps for 10 seconds and then tries
    exactly one more time. An error raised by that second attempt is
    propagated to the caller.
    """
    try:
        return _fetch(url)
    # When you load too many users, urllib2 can give this error.
    except urllib2.HTTPError:
        wikipedia.output(u"Server error. Pausing for 10 seconds before continuing. " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()))
        time.sleep(10)
        return _fetch(url)
# Main block: download the page, then either print it or save it to
# print.txt. The outer finally makes sure wikipedia.stopme() is called
# before the script stops, whether or not an error occurred.
try:
    # Download first, so that a failed request does not leave behind an
    # empty (truncated) print.txt.
    text = pageText(url)
    if printme:
        # Hand the decoded text straight to wikipedia.output(); the previous
        # 'u%s' format string printed a literal "u" in front of the page.
        wikipedia.output(text)
    else:
        rock = open('print.txt', 'w')
        try:
            rock.write(text.encode(config.console_encoding))
        finally:
            # Always release the file handle, even if the write fails.
            rock.close()
    wikipedia.output('Done!')
finally:
    wikipedia.stopme()