# (removed stray "1 contributor" web-page artifact that broke Python parsing)
#!/usr/bin/env python
from urlparse import urlparse
from bs4 import BeautifulSoup
import feedparser
import base64
import pprint
import urllib2
import lxml.html
import re
import sys
import getopt
from slimmer import html_slimmer
import weasyprint
MAX_ARTICLES=20
verbose = False
pdf = False
output_filename = 'default.html'
rss_url = 'http://www.lemonde.fr/rss/une.xml'
options, remainder = getopt.getopt(sys.argv[1:], 'o:v', ['output=',
'verbose',
'pdf',
'url=',
])
for opt, arg in options:
if opt in ('-o', '--output'):
output_filename = arg
elif opt in ('-u', '--url'):
rss_url = arg
elif opt in ('-v', '--verbose'):
verbose = True
elif opt == '--version':
version = arg
elif opt == '--pdf':
pdf = True
#~ print pdf
if pdf == False :
CSS="h1,h2{font-weight:700}img,ul{width:440px;padding:0}em,img{text-align:left;align:left}img{height:208px}#nav-next:hover,#nav-prev:hover,a:hover{background:#333}body{color:#000;font-family:Helvetica Neue,Helvetica,Arial,sans-serif;background-color:#f0f0f0}h1{font-size:1.5rem;line-height:1.5rem}h2,h3{font-size:1rem;line-height:1rem}details,h3{font-weight:400;font-style:italic}h3{background-color:#cdcdcd}details{font-family:TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;font-size:.5rem}ul{list-style-type:none;color:#00F}ul:hover{cursor:pointer;cursor:hand}figure{margin-left:0;text-align:center}.img-heading,.img-nav{width:50px}#nav-next,#nav-prev,#nav-up{font-size:200%;font-weight:700;color:#00f}#nav-source{font-size:100%;font-weight:700;color:#00f}#article,#article-current{width:440px}.pullquote{padding:.5rem 1.5rem 0;font:700 1em/.8em TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;position:relative;margin-bottom:1.5rem;z-index:1}.pullquote:after,.pullquote:before{color:silver;position:absolute;content:'"';font-size:5em;height:.5rem;line-height:.75em;top:0;left:-.07em;z-index:-1}.pullquote:after{content:'"';top:auto;bottom:0;left:auto;right:0;line-height:.36em}a{text-decoration:none}a:link,a:visited{color:#00F}"
else:
CSS="h1,h2{font-weight:700}img,ul{width:440px;padding:0}em,img{text-align:left;align:left}img{height:208px}#nav-next:hover,#nav-prev:hover,a:hover{background:#333}body{color:#000;font-family:Helvetica Neue,Helvetica,Arial,sans-serif;background-color:#fff}h1{font-size:1.5rem;line-height:1.5rem}h2,h3{font-size:1rem;line-height:1rem}details,h3{font-weight:400;font-style:italic}h3{background-color:#cdcdcd}details{font-family:TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;font-size:.5rem}ul{list-style-type:none;color:#00F}ul:hover{cursor:pointer;cursor:hand}figure{margin-left:0;text-align:center}.img-heading,.img-nav{width:50px}#nav-next,#nav-prev,#nav-up{font-size:200%;font-weight:700;color:#00f}#nav-source{font-size:100%;font-weight:700;color:#00f}#article,#article-current{width:440px}.pullquote{padding:.5rem 1.5rem 0;font:700 1em/.8em TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;position:relative;margin-bottom:1.5rem;z-index:1}.pullquote:after,.pullquote:before{color:silver;position:absolute;content:'"';font-size:5em;height:.5rem;line-height:.75em;top:0;left:-.07em;z-index:-1}.pullquote:after{content:'"';top:auto;bottom:0;left:auto;right:0;line-height:.36em}a{text-decoration:none}a:link,a:visited{color:#00F}"
class Printable:
    """Mixin that renders an instance as '<ClassName> {pretty attrs}'.

    Uses pprint so nested attribute values stay readable while debugging.
    """
    def __repr__(self):
        from pprint import pformat
        attrs = pformat(vars(self), indent=4, width=1)
        return "<" + type(self).__name__ + "> " + attrs
class FeedDetails(Printable):
    """Feed-level metadata extracted from the parsed RSS document."""
    num = 0        # number of entries the feed contains
    title = ""     # feed title (utf-8 encoded bytes)
    link = ""      # canonical feed URL
    subtitle = ""  # feed subtitle / description

    def debug_print(self):
        """Print one labelled line per field (debug helper)."""
        for line in ("FeedDetails:num     :{:d}".format(self.num),
                     "FeedDetails:title   :{:s}".format(self.title),
                     "FeedDetails:link    :{:s}".format(self.link),
                     "FeedDetails:subtitle:{:s}".format(self.subtitle)):
            print(line)
class ArticleDetails(Printable):
    """Per-article state: RSS fields plus the fetched and cleaned HTML."""
    title = ""         # article headline (utf-8 encoded bytes)
    link = ""          # canonical article URL (tracking suffix removed)
    summary = ""       # short RSS summary
    content = ""       # raw fetched HTML page; None when the fetch failed
    enclosure = ""     # URL of the lead image
    content_only = ""  # extracted and cleaned article markup

    def debug_print(self):
        """Print one labelled line per field (debug helper)."""
        print("ArticleDetails:title :{:s}".format(self.title))
        print("ArticleDetails:link :{:s}".format(self.link))
        print("ArticleDetails:summary :{:s}".format(self.summary))
        print("ArticleDetails:enclosure :{:s}".format(self.enclosure))
        # BUG FIX: the label said 'content_only' but the value printed was
        # self.content; print the field the label names.
        print("ArticleDetails:content_only :{:s}".format(self.content_only))
        print("ArticleDetails:content : ================")
def process_match(m):
    """Regex-substitution callback stub: drop the whole match.

    Kept as a placeholder; currently discards *m* and returns an empty string.
    """
    return ''
debug = 1
feed_details = FeedDetails()

print("-- rss url : {:s}".format(rss_url))
# Derive "scheme://host/" so relative hrefs can be rewritten to absolute ones.
parsed_uri = urlparse(rss_url)
domain_url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
print("-- domain url : {:s}".format(domain_url))

# Fetch and parse the feed, then keep only the metadata we render.
d = feedparser.parse(rss_url)
feed_details.title = d['feed']['title'].encode('utf-8').strip()
feed_details.link = d['feed']['link'].encode('utf-8').strip()
feed_details.subtitle = d['feed']['subtitle'].encode('utf-8').strip()
feed_details.num = len(d['entries'])

# Emit the fixed HTML page header into the output file.
f = open(output_filename, 'w')
f.writelines([
    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\n",
    "\thttp://www.w3.org/TR/html4/loose.dtd\">\n",
    "<html>\n",
    "<head>\n",
    " <title>" + feed_details.title + "</title>\n",
    " <meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\">\n",
    " <meta name=\"viewport\" content=\"width=450px, user-scalable=no\">\n",
    " <link rel=\"icon\" type=\"image/ico\" href=\"favicon.ico\">\n",
])
if pdf is False:
    # Interactive mode only: onArticle(i) copies article i's markup into the
    # #article-current placeholder and jumps back to the top anchor.
    f.writelines([
        " <script>\n",
        " function onArticle(index) {\n",
        " var string_index = \"article-\"+index;\n",
        " var url = location.href;\n",
        " document.getElementById(\"article-current\").innerHTML =\n",
        " document.getElementById(string_index).innerHTML;\n",
        " location.href = \"#article-top\";\n",
        " }\n",
        " </script>\n",
    ])
f.write("<style>\n" + CSS + "\n</style>\n")
f.write("</head>\n")
f.write("<body>\n")
f.write("<h1 id=\"top\">" + feed_details.title + "</h1>\n")
for article in d.entries:
article_details = ArticleDetails()
article_details.title = article.title.encode('utf-8').strip()
article_details.link = article.link.encode('utf-8').strip("?xtor=RSS-3208")
#~ article_details.link = urlparse(article.link.encode('utf-8').strip(),allow_fragments=F).geturl()
article_details.summary = article.summary.encode('utf-8').strip()
for link in article.links:
if "enclosure" == link.rel:
article_details.enclosure = link.href
if article_details.enclosure == "":
soup_mysite = BeautifulSoup(article.description,"lxml")
content = soup_mysite.find("img")
article_details.enclosure = content.get('src')
# Not Working as is. Generated image is too big
#if article_details.enclosure is not None:
#img_content=urllib2.urlopen(article.link).read()
#article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
if pdf is False:
f.write("<ul><div onclick=\"onArticle("+str(cpt)+")\" style=\"display:inline;\">\n")
f.write("\t<img src=\""+article_details.enclosure+"\" style=\"display:inline;\"><br>\n")
else:
f.write("<ul><div style=\"display:inline;\"><a href=\"#article-"+str(cpt)+"\">\n")
f.write("\t<img src=\""+article_details.enclosure+"\" style=\"display:inline;\"></a><br>\n")
f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
if pdf is False:
f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↧</a></div>\n")
f.write("\t"+article_details.title+"</div></ul>\n")
else:
f.write("\t<a href=\"#article-"+str(cpt)+"\">"+article_details.title+"</a></div></ul>\n")
articles.append(article_details)
cpt=cpt+1
if cpt > MAX_ARTICLES:
break
# Remember how many index slots were emitted; used to clamp "next" navigation.
cpt_num = cpt
if pdf is False:
    # Interactive mode: placeholder div that onArticle() fills with the
    # selected article, plus the anchor the script jumps back to.
    f.write("\n<a name=\"article-top\"></a>\n"
            "<div id=\"article-current\"></div>\n\n")
# Second pass: fetch each article page, extract the body, scrub site chrome,
# and emit it (hidden div in interactive mode, inline section in PDF mode).
cpt = 0
# PERF: compiled once instead of per article. Strips Le Monde's
# conjugation-helper links, keeping only their inner text.
regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
for article in articles:
    print("-- {:d} : {:s}".format(cpt, article.title))
    opener = urllib2.build_opener()
    # Some sites refuse the default Python user agent; impersonate Firefox.
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0')]
    try:
        response = opener.open(article.link)
        article.content = response.read()
    # BUG FIX: the original bare 'except' printed response.code, but
    # 'response' is unbound when opener.open() itself raises, turning the
    # original failure into a NameError. Report the real error instead.
    except Exception as fetch_err:
        print(" -- fetch failed: {!r}".format(fetch_err))
        article.content = None
    if article.content is None:
        # Fetch failed: still emit the article shell with its navigation so
        # in-page anchors keep working, then move on to the next article.
        cpt_prev = cpt - 1
        if cpt_prev < 0:
            cpt_prev = 0
        cpt_next = cpt + 1
        if cpt_next > cpt_num:
            cpt_next = cpt_num
        if pdf is False:
            f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: none;\">\n")
        else:
            print("flat")  # debug marker for the flat (PDF) layout
            f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: inline;\">\n")
        f.write("<hr><a name=\"article-"+str(cpt)+"\">")
        f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
        if pdf is False:
            f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
        f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div> ")
        if pdf is False:
            f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">↤</div>\n")
            f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">↦</div>\n")
        else:
            f.write("<div id=\"nav-prev\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_prev)+"\">↤</a></div>\n")
            f.write("<div id=\"nav-next\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_next)+"\">↦</a></div>\n")
        f.write("<div class=\"extract-content\" id=\""+str(cpt)+"\">\n")
        f.write(article.content_only)
        f.write("\n</div>\n")
        f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
        f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
        f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div> ")
        if pdf is False:
            f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">↤</div>\n")
            f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">↦</div>\n")
        else:
            f.write("<div id=\"nav-prev\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_prev)+"\">↤</a></div>\n")
            f.write("<div id=\"nav-next\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_next)+"\">↦</a></div>\n")
        f.write("</div>\n\n")
        cpt = cpt + 1
        continue
    # Locate the article body: semantic <article>, then the site-specific
    # wrappers, then fall back to rebuilding it from title + summary.
    soup_mysite = BeautifulSoup(article.content, "lxml")
    content = soup_mysite.find("article")
    if content is None:
        content = soup_mysite.find('div', attrs={'class': 'article-page'})
    if content is None:
        only_text = soup_mysite.find('div', attrs={'id': 'the-content'})
        content = "<h1>{:s}</h1><h3>{:s}</h3>{:s}".format(article.title, article.summary, only_text)
    # Numerama pages carry their headline outside the extracted body.
    if rss_url == "http://www.numerama.com/rss/news.rss":
        content = "<h1>{:s}</h1>{:s}".format(article.title, content)
    article.content_only = str(content)
    # Rewrite site-relative links to absolute ones on the feed's domain.
    new_full_domain = " href=\"{:s}".format(domain_url)
    article.content_only = article.content_only.replace(" href=\"/", new_full_domain)
    # Scrub site chrome, tracking markup and oversized media attributes.
    article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
    article.content_only = article.content_only.replace('<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script></div>','')
    article.content_only = article.content_only.replace('<div class="toolbar"></div>','')
    article.content_only = article.content_only.replace('<figure class="illustration_haut " style="width: 534px">','<figure>')
    article.content_only = article.content_only.replace('<figure class="illustration_haut">','<figure>')
    article.content_only = article.content_only.replace('<span id="publisher" itemprop="Publisher" data-source="LE MONDE">Le Monde</span>','Le Monde')
    article.content_only = article.content_only.replace(' data-lazyload="false" ',' ')
    article.content_only = article.content_only.replace('Par<span>','Par <span>')
    article.content_only = article.content_only.replace('<span class="source_image" ','<span ')
    article.content_only = article.content_only.replace('<div class="bg_gris_moyen signature">','<div>')
    article.content_only = article.content_only.replace('<p itemprop="author" class="auteur txt2_120">','<p>')
    article.content_only = article.content_only.replace('<li class="clearfix" itemprop="author">','<li>')
    article.content_only = article.content_only.replace('<img width="534"','<img width="400"')
    article.content_only = article.content_only.replace('onclick="return false;" ','')
    article.content_only = article.content_only.replace('target="_blank"','')
    article.content_only = article.content_only.replace('class="lien_interne rub"','')
    article.content_only = article.content_only.replace('<a class="auteur" target="_blank" ','<a target="_blank" ')
    article.content_only = article.content_only.replace('<a class="lien_interne rub" ','<a target="_blank" ')
    article.content_only = article.content_only.replace('<a target=\'_blank\' onclick=\'return false;\' class=\'lien_interne conjug\' ','<a target="_blank" ')
    article.content_only = article.content_only.replace('<h1 class="tt2" itemprop="Headline">','<h1>')
    article.content_only = article.content_only.replace('<h2 class="taille_courante">','<h2>')
    article.content_only = article.content_only.replace('<h2 class="intertitre">','<h2>')
    article.content_only = article.content_only.replace('<p class="lire">','<p>')
    article.content_only = article.content_only.replace('<p class="lire marqueur_restreint_atome">','<p>')
    article.content_only = article.content_only.replace('<p class="bloc_signature">','<p>')
    article.content_only = article.content_only.replace('<strong class="txt1 txt_gris_moyen">Suivre</strong>','')
    article.content_only = article.content_only.replace('<br><span class="txt_gris_clair">Journaliste au Monde</span>','Journaliste au Monde')
    article.content_only = article.content_only.replace('<!-- atome snippet -->','')
    article.content_only = article.content_only.replace('?syndication=131181','')
    article.content_only = article.content_only.replace('<iframe width="544" height="306" ','<iframe width="440" height="248" ')
    article.content_only = article.content_only.replace(' frameborder="0" width="534" height="320">',' frameborder="0" width="440" height="264">')
    article.content_only = article.content_only.replace(' onload="lmd.pic(this);" onerror="lmd.pic(this);" class="lazy-retina">','>')
    article.content_only = article.content_only.replace('<figcaption class="legende" data-caption="','<br><em>')
    article.content_only = article.content_only.replace('<figcaption class="legende">','<br><em>')
    article.content_only = article.content_only.replace('</figcaption>','</em>')
    article.content_only = article.content_only.replace('\\n','')
    # Cleanup: drop conjugation-helper links, keep their text.
    article.content_only = regexConjug.sub('\\1', article.content_only)
    # Diet: collapse whitespace and minify the markup.
    article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
    # Emit the article shell with top/bottom navigation bars.
    cpt_prev = cpt - 1
    if cpt_prev < 0:
        cpt_prev = 0
    cpt_next = cpt + 1
    if cpt_next > cpt_num:
        cpt_next = cpt_num
    if pdf is False:
        f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: none;\">\n")
    else:
        f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: inline;\">\n")
    f.write("<hr><a name=\"article-"+str(cpt)+"\">")
    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
    if pdf is False:
        f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
    f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div> ")
    if pdf is False:
        f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">↤</div>\n")
        f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">↦</div>\n")
    else:
        f.write("<div id=\"nav-prev\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_prev)+"\">↤</a></div>\n")
        f.write("<div id=\"nav-next\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_next)+"\">↦</a></div>\n")
    f.write("<div class=\"extract-content\" id=\""+str(cpt)+"\">\n")
    f.write(article.content_only)
    f.write("\n</div>\n")
    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
    if pdf is False:
        f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
    f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div> ")
    if pdf is False:
        f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">↤</div>\n")
        f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">↦</div>\n")
    else:
        f.write("<div id=\"nav-prev\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_prev)+"\">↤</a></div>\n")
        f.write("<div id=\"nav-next\" style=\"display:inline;\"><a href=\"#article-"+str(cpt_next)+"\">↦</a></div>\n")
    f.write("</div>\n\n")
    cpt = cpt + 1
    # BUG FIX: '>' allowed MAX_ARTICLES+1 articles through; stop at the limit.
    if cpt >= MAX_ARTICLES:
        break
f.close()
# Render the generated HTML page to a sibling .pdf file.
# BUG FIX: the original wrote the PDF bytes through file(..., 'w') -- text
# mode corrupts binary data on some platforms and the handle was never
# closed. It also rebound the name 'pdf', clobbering the CLI flag.
pdf_bytes = weasyprint.HTML(filename=output_filename).write_pdf()
pdf_out = open(output_filename + ".pdf", 'wb')
try:
    pdf_out.write(pdf_bytes)
finally:
    pdf_out.close()