From 298adef15b6fe22a630dc7b41493feec1f627d9e Mon Sep 17 00:00:00 2001
From: Dustin Cannon <dustin.cannon@akimbo.io>
Date: Tue, 18 Dec 2012 20:22:33 +0000
Subject: [PATCH 1/2] Updated readability to find the "main image" when it
 simplifies content

* Find the "main image" if possible
* Added requirements.txt
---
 readability/readability.py | 68 ++++++++++++++++++++++++++++++++++++++
 requirements.txt           |  3 ++
 2 files changed, 71 insertions(+)
 create mode 100644 requirements.txt

diff --git a/readability/readability.py b/readability/readability.py
index 41a023c5..4bd05484 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -2,6 +2,10 @@
 import logging
 import re
 import sys
+import requests
+
+from PIL import Image
+from StringIO import StringIO
 
 from collections import defaultdict
 from lxml.etree import tostring
@@ -99,6 +103,68 @@ def __init__(self, input, **options):
         self.input = input
         self.options = options
         self.html = None
+        self.main_image_dict = {}
+
+    def _get_main_image_dict(self, html_string):
+        """Find the largest usable image referenced by ``html_string``.
+
+        Every ``<img>`` with a ``src`` is downloaded and opened with PIL;
+        images that cannot be fetched or decoded, or whose pixel area is
+        below 100 * 100, are ignored.
+
+        Returns a dict with the keys ``url``, ``size``, ``pix-area`` and
+        ``object`` (the PIL image) for the largest remaining image, or an
+        empty dict when there is none.
+        """
+        min_pix_area = 10000  # 100 * 100
+
+        # Transform the html string into an lxml tree
+        doc = build_doc(html_string)
+
+        # Get the urls out of the img tags; an <img> without a src attribute
+        # yields None here (and is skipped below) instead of raising KeyError.
+        image_urls = [tag.get('src') for tag in self.tags(doc, 'img')]
+
+        # Get actual image data
+        images = []
+        for u in image_urls:
+            if not u:
+                continue
+
+            # A relative/malformed url or a network failure should not
+            # propagate out of this helper; treat it as an unusable image.
+            try:
+                r = requests.get(u)
+            except requests.exceptions.RequestException:
+                continue
+            if r.status_code != 200:
+                continue
+
+            try:
+                image_data = Image.open(StringIO(r.content))
+            except IOError:
+                continue
+
+            pix_area = image_data.size[0] * image_data.size[1]
+            # Filter out images that are not big enough
+            if pix_area < min_pix_area:
+                continue
+
+            images.append(
+                    {
+                        'url': u,
+                        'size': image_data.size,
+                        'pix-area': pix_area,
+                        'object': image_data
+                    })
+
+        # If we have no images we return an empty dict
+        if not images:
+            return {}
+
+        # Return the image with the largest pixel area (the first one wins a
+        # tie); max() also covers the case of a single candidate.
+        return max(images, key=lambda image: image['pix-area'])
 
     def _html(self, force=False):
         if force or self.html is None:
@@ -176,6 +242,8 @@ def summary(self, html_partial=False):
                     # Loop through and try again.
                     continue
                 else:
+                    # Try to get the main image
+                    self.main_image_dict = self._get_main_image_dict(cleaned_article)
                     return cleaned_article
         except StandardError, e:
             log.exception('error getting summary: ')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..c6b7385c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+lxml
+PIL
+requests

From 0d08bc0558962ec0ae603b452424ef48817ed047 Mon Sep 17 00:00:00 2001
From: Dustin Cannon <dustin.cannon@akimbo.io>
Date: Tue, 18 Dec 2012 21:28:06 +0000
Subject: [PATCH 2/2] Updated metadata files

* Updated VERSIONS and setup.py with new akimbo-specific version number
* Updated requirements.txt with specific version numbers
---
 VERSIONS         | 6 ++++++
 requirements.txt | 6 +++---
 setup.py         | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/VERSIONS b/VERSIONS
index 268f16fe..109b3203 100644
--- a/VERSIONS
+++ b/VERSIONS
@@ -1,3 +1,9 @@
+Changes in VERSION 0.1.02
+    Added ability to get "main image"
+        - Document class now has main_image_dict attribute which is populated
+          by the summary() method
+        - Added a requirements.txt
+
 Changes in VERSION 0.1.01
     Began customizing Readability
         -Added several divs to unlikelyCandidatesRe
diff --git a/requirements.txt b/requirements.txt
index c6b7385c..c1b60c29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-lxml
-PIL
-requests
+lxml==2.3.3
+PIL==1.1.7
+requests==0.13.3
diff --git a/setup.py b/setup.py
index 59851754..7c8a32ed 100755
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="readability-lxml",
-    version="0.1.01", # Akimbo Specific Version
+    version="0.1.02", # Akimbo Specific Version
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",