Wikipedia kategorisindeki makaleleri almak - HTML

Dünkü yazıda, bir Wikipedia kategorisindeki makaleleri wikitext formatında almayı göstermiştim. Bu yazıda ise HTML olarak alacağız.

# -*- coding: utf-8 -*-
import os
import errno
from urllib import quote
import requests
import json
from HTMLParser import HTMLParser
from sys import version
# Fill in please: these values are embedded in the User-Agent string below.
your_name = "Yaşar Arabacı"
email = ""
# User-Agent string built from the interpreter version and the contact
# details above; get() sends it with every request.
useragent = "Export a category -- Python %s, %s %s" % (version, your_name, email)
# Category whose member pages are exported.
category = "Kategori:Fizik"
# The percent-encoded category name doubles as the output directory name.
dirname = quote(category)
# Parser instance used only for its unescape() helper.
h = HTMLParser()
# Layout of each written file; filled with the page title and its HTML.
template = "{title}{content}"
# Create target directory; an already existing directory is not an error,
# any other failure (permissions, bad path, ...) is re-raised.
try:
    os.mkdir(dirname)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
class WikiError(Exception):
    """Raised when an API response contains an "error" entry."""
def process_pages(pages):
    """Write every page in *pages* to its own HTML file inside ``dirname``.

    pages -- mapping whose values are page dicts; each one must provide
             ``"title"`` and ``"revisions"[0]["*"]`` (the page content).

    The file name is the percent-encoded UTF-8 title plus ``.html``.
    """
    for page in pages.values():
        title = page["title"].encode("utf-8")
        print(page["title"])
        # safe="" also escapes "/", which quote() leaves alone by default;
        # otherwise a title containing a slash would point into a
        # subdirectory that does not exist.
        filename = "%s.html" % quote(title, safe="")
        html = h.unescape(page["revisions"][0]["*"])
        rendered = template.format(title=title, content=html.encode("utf-8"))
        with open(os.path.join(dirname, filename), "w") as f:
            f.write(rendered)
def get(action, **kwargs):
    """Call the wiki API and return the decoded JSON response.

    action -- value sent as the ``action`` request parameter.
    kwargs -- any further request parameters.

    Returns the parsed response as a dict.
    Raises WikiError with the reported info text when the response
    contains an "error" entry.
    """
    # NOTE(review): the endpoint was empty in the original; the Turkish
    # Wikipedia API is assumed from the "Kategori:" namespace -- confirm.
    endpoint = "https://tr.wikipedia.org/w/api.php"
    headers = {"User-Agent": useragent}
    # Send the requested action along with the caller's parameters, and ask
    # for JSON because the response body is parsed with json.loads below.
    query = dict(kwargs, action=action)
    query.setdefault("format", "json")
    r = requests.get(endpoint, headers=headers, params=query)
    print(r.url)
    js = json.loads(r.text)
    if "error" in js:
        raise WikiError(js["error"]["info"])
    return js
# Parameters that request the parsed content of all pages in the given category
params = {
    "generator" : "categorymembers",
    "gcmtitle" : category,
    "gcmtype": "page",
    "prop" : "revisions",
    "rvprop" : "content",
    "rvparse" : 1,
}
# Fetch the first batch and write its pages to disk.
a = get("query", **params)
process_pages(a.get("query", {}).get("pages", {}))
# Keep requesting follow-up batches for as long as the response carries
# continuation data.
# NOTE(review): "query-continue" is the legacy continuation format; newer
# MediaWiki releases may return a top-level "continue" key instead unless
# rawcontinue is requested -- confirm against the target wiki.
while "query-continue" in a:
    nparams = dict(params)
    # Merge the continuation values into the next request; without them
    # the same batch would be fetched forever.
    for continuation in a["query-continue"].values():
        nparams.update(continuation)
    a = get("query", **nparams)
    process_pages(a.get("query", {}).get("pages", {}))
print("done")