WikiTeam · DanielOaks · Oct 22, 2015
diff --git a/dumpgenerator.py b/dumpgenerator.py
@@ -573,19 +573,31 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
     if 'templates' in config and config['templates']:
         params['templates'] = 1
 
-    xml = getXMLPageCore(params=params, config=config, session=session)
-    if xml == "":
-        raise ExportAbortedError(config['index'])
-    if not "</page>" in xml:
-        raise PageMissingError(params['title'], xml)
-    else:
-        # strip these sha1s sums which keep showing up in the export and
-        # which are invalid for the XML schema (they only apply to
-        # revisions)
-        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
-        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+    while True:
+        try:
+            xml = getXMLPageCore(params=params, config=config, session=session)
+            if xml == "":
+                raise ExportAbortedError(config['index'])
+            if "</page>" not in xml:
+                raise PageMissingError(params['title'], xml)
+            else:
+                # do the split before the regexes because .split throws a
+                # MemoryError if it runs out of memory, regexes just kills the
+                # process outright, this lets us download larger pages
+                xml = xml.split("</page>")[0]
+
+                # strip these sha1s sums which keep showing up in the export and
+                # which are invalid for the XML schema (they only apply to
+                # revisions)
+                xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+                xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+            break
+        except MemoryError:
+            print "The page's history exceeds our memory, halving limit."
+            params['limit'] = params['limit'] / 2
+            continue
 
-    yield xml.split("</page>")[0]
+    yield xml
 
     # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
     # else, warning about Special:Export truncating large page histories