Skip to content

Commit

Permalink
Merge pull request akimboio#4 from akimboio/develop
Browse files Browse the repository at this point in the history
Updated readability to find the "main image" when it simplifies content
  • Loading branch information
dustincannon committed Dec 18, 2012
2 parents 4a08686 + 490c35d commit e2d2868
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 1 deletion.
6 changes: 6 additions & 0 deletions VERSIONS
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
Changes in VERSION 0.1.02
Added ability to get "main image"
- Document class now has main_image_dict attribute which is populated
by the summary() method
- Added a requirements.txt

Changes in VERSION 0.1.01
Began customizing Readability
- Added several divs to unlikelyCandidatesRe
Expand Down
68 changes: 68 additions & 0 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import logging
import re
import sys
import requests

from PIL import Image
from StringIO import StringIO

from collections import defaultdict
from lxml.etree import tostring
Expand Down Expand Up @@ -99,6 +103,68 @@ def __init__(self, input, **options):
self.input = input
self.options = options
self.html = None
self.main_image_dict = {}

def _get_main_image_dict(self, html_string):
    """Try to find the main image in the given html string.

    Every <img> tag in ``html_string`` that carries a ``src`` attribute is
    downloaded and opened with PIL. Images whose pixel area is below
    100 * 100 are discarded, and the remaining image with the largest
    pixel area is returned.

    Downloading is best-effort: an image that cannot be fetched (request
    error or non-200 status) or cannot be decoded is skipped, so a broken
    image never prevents the others from being considered.

    Returns a dict with the keys ``'url'``, ``'size'`` (the PIL size
    tuple), ``'pix-area'`` (width * height) and ``'object'`` (the PIL
    image), or an empty dict when no suitable image was found.
    """
    min_pix_area = 10000  # 100 * 100

    # Transform the html string into an lxml tree
    doc = build_doc(html_string)

    # Get the urls out of the img tags. An <img> without a src (or with an
    # empty one) has nothing to download, so skip it instead of raising
    # KeyError.
    image_urls = []
    for tag in self.tags(doc, 'img'):
        src = tag.attrib.get('src')
        if src:
            image_urls.append(src)

    # Get actual image data, keeping only images that are big enough
    images = []
    for url in image_urls:
        # NOTE(review): no timeout is passed, so a stalled server can
        # block this call; consider choosing one for the project.
        try:
            response = requests.get(url)
        except requests.RequestException:
            # Unreachable host, relative/schema-less url, etc.
            continue
        if response.status_code != 200:
            continue

        try:
            image_data = Image.open(StringIO(response.content))
        except IOError:
            # Not something PIL can identify as an image
            continue

        width, height = image_data.size
        pix_area = width * height
        if pix_area < min_pix_area:
            continue

        images.append(
            {
                'url': url,
                'size': image_data.size,
                'pix-area': pix_area,
                'object': image_data
            })

    # If we have no images we return an empty dict
    if not images:
        return {}

    # Return the largest image. max() yields the first of several equally
    # large images and handles the single-image case as well.
    return max(images, key=lambda image_dict: image_dict['pix-area'])

def _html(self, force=False):
if force or self.html is None:
Expand Down Expand Up @@ -176,6 +242,8 @@ def summary(self, html_partial=False):
# Loop through and try again.
continue
else:
# Try to get the main image
self.main_image_dict = self._get_main_image_dict(cleaned_article)
return cleaned_article
except StandardError, e:
log.exception('error getting summary: ')
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
lxml==2.3.3
PIL==1.1.7
requests==0.13.3
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name="readability-lxml",
version="0.1.01", # Akimbo Specific Version
version="0.1.02", # Akimbo Specific Version
author="Yuri Baburov",
author_email="[email protected]",
description="fast python port of arc90's readability tool",
Expand Down

0 comments on commit e2d2868

Please sign in to comment.