From 298adef15b6fe22a630dc7b41493feec1f627d9e Mon Sep 17 00:00:00 2001
From: Dustin Cannon
Date: Tue, 18 Dec 2012 20:22:33 +0000
Subject: [PATCH 1/2] Updated readability to find the "main image" when it
 simplifies content

* Find the "main image" if possible
* Added requirements.txt
---
 readability/readability.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt           |  3 +++
 2 files changed, 66 insertions(+)
 create mode 100644 requirements.txt

diff --git a/readability/readability.py b/readability/readability.py
index 41a023c5..4bd05484 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -2,6 +2,10 @@
 import logging
 import re
 import sys
+import requests
+
+from PIL import Image
+from StringIO import StringIO
 
 from collections import defaultdict
 from lxml.etree import tostring
@@ -99,6 +103,63 @@ def __init__(self, input, **options):
         self.input = input
         self.options = options
         self.html = None
+        self.main_image_dict = {}
+
+    def _get_main_image_dict(self, html_string):
+        """Try to find the main image in the given html string.
+
+        Every img tag that has a src is downloaded and opened with PIL.
+        Images whose pixel area is below 10000 (100 * 100) are ignored
+        and, of the rest, the one with the largest pixel area is returned.
+
+        Returns a dict with the keys 'url', 'size', 'pix-area' and
+        'object' (the PIL image), or an empty dict if no image qualifies.
+        """
+        min_pix_area = 10000 # 100 * 100
+
+        # Transform the html string into an lxml tree
+        doc = build_doc(html_string)
+
+        # Get actual image data
+        images = []
+        for tag in self.tags(doc, 'img'):
+            url = tag.get('src')
+            if not url:
+                # An img tag without a src has nothing to download
+                continue
+
+            # Best effort: skip urls that cannot be fetched
+            try:
+                r = requests.get(url)
+            except requests.exceptions.RequestException:
+                continue
+            if r.status_code != 200:
+                continue
+
+            try:
+                image_data = Image.open(StringIO(r.content))
+            except IOError:
+                continue
+
+            # Filter out images that are not big enough
+            pix_area = image_data.size[0] * image_data.size[1]
+            if pix_area < min_pix_area:
+                continue
+
+            images.append(
+                {
+                    'url': url,
+                    'size': image_data.size,
+                    'pix-area': pix_area,
+                    'object': image_data
+                })
+
+        # If we have no images we return an empty dict
+        if not images:
+            return {}
+
+        # Return the largest image (max() keeps the first one on a tie)
+        return max(images, key=lambda image: image['pix-area'])
 
     def _html(self, force=False):
         if force or self.html is None:
@@ -176,6 +237,8 @@ def summary(self, html_partial=False):
                     # Loop through and try again.
                     continue
                 else:
+                    # Try to get the main image
+                    self.main_image_dict = self._get_main_image_dict(cleaned_article)
                     return cleaned_article
         except StandardError, e:
             log.exception('error getting summary: ')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..c6b7385c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+lxml
+PIL
+requests

From 0d08bc0558962ec0ae603b452424ef48817ed047 Mon Sep 17 00:00:00 2001
From: Dustin Cannon
Date: Tue, 18 Dec 2012 21:28:06 +0000
Subject: [PATCH 2/2] Updated metadata files

* Updated VERSIONS and setup.py with new akimbo-specific version number
* Updated requirements.txt with specific version numbers
---
 VERSIONS         | 6 ++++++
 requirements.txt | 6 +++---
 setup.py         | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/VERSIONS b/VERSIONS
index 268f16fe..109b3203 100644
--- a/VERSIONS
+++ b/VERSIONS
@@ -1,3 +1,9 @@
+Changes in VERSION 0.1.02
+  Added ability to get "main image"
+    - Document class now has main_image_dict attribute which is populated
+      by the summary() method
+    - Added a requirements.txt
+
 Changes in VERSION 0.1.01
   Began customizing Readability
     -Added several divs to unlikelyCandidatesRe
diff --git a/requirements.txt b/requirements.txt
index c6b7385c..c1b60c29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-lxml
-PIL
-requests
+lxml==2.3.3
+PIL==1.1.7
+requests==0.13.3
diff --git a/setup.py b/setup.py
index 59851754..7c8a32ed 100755
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name="readability-lxml",
-    version="0.1.01", # Akimbo Specific Version
+    version="0.1.02", # Akimbo Specific Version
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",