Showing 2 changed files with 20 additions and 14 deletions
+7 -4
README.md
... ...
@@ -24,7 +24,10 @@ The article content (i.e. Feed link) is fetched automatically and the content is e
24 24
 - **<article>...</article>**
25 25
 
26 26
 ## Examples
27
-- ./newsfetch.py --url http://www.vice.com/fr/rss
28
-- ./newsfetch.py --url http://www.lemonde.fr/rss/une.xml
29
-- ./newsfetch.py --url https://www.slate.fr/rss.xml
30
-- ./newsfetch.py --url http://www.lesinrocks.com/feeds/feed-a-la-une/
27
+```bash
28
+./newsfetch.py --url http://www.vice.com/fr/rss -o vice.fr.html
29
+./newsfetch.py --url http://www.lemonde.fr/rss/une.xml -o lemonde.html
30
+./newsfetch.py --url https://www.slate.fr/rss.xml -o slate.fr.html
31
+./newsfetch.py --url http://www.lesinrocks.com/feeds/feed-a-la-une/ -o lesinrocks.html
32
+./newsfetch.py --url http://www.numerama.com/rss/news.rss -o numerama.html
33
+```
+13 -10
newsfetch.py
... ...
@@ -11,6 +11,7 @@ import sys
11 11
 import getopt
12 12
 from slimmer import html_slimmer 
13 13
 
14
+MAX_ARTICLES=20
14 15
 verbose = False
15 16
 output_filename = 'default.html'
16 17
 rss_url = 'http://www.lemonde.fr/rss/une.xml'
... ...
@@ -30,7 +31,7 @@ for opt, arg in options:
30 31
     elif opt == '--version':
31 32
         version = arg
32 33
 
33
-CSS="h1,h2{font-weight:700}img,ul{width:440px;padding:0}em,img{text-align:left;align:left}#nav-next:hover,#nav-prev:hover,a:hover{background:#333}body{color:#000;font-family:Helvetica Neue,Helvetica,Arial,sans-serif;background-color:#f0f0f0}h1{font-size:1.5rem;line-height:1.5rem}h2,h3{font-size:1rem;line-height:1rem}details,h3{font-weight:400;font-style:italic}h3{background-color:#cdcdcd}details{font-family:TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;font-size:.5rem}ul{list-style-type:none;color:#00F}ul:hover{cursor:pointer;cursor:hand}figure{margin-left:0;text-align:center}.img-heading,.img-nav{width:50px}#nav-next,#nav-prev,#nav-up{font-size:200%;font-weight:700;color:#00f}#nav-source{font-size:100%;font-weight:700;color:#00f}#article,#article-current{width:440px}.pullquote{padding:.5rem 1.5rem 0;font:700 1em/.8em TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;position:relative;margin-bottom:1.5rem;z-index:1}.pullquote:after,.pullquote:before{color:silver;position:absolute;content:'"';font-size:5em;height:.5rem;line-height:.75em;top:0;left:-.07em;z-index:-1}.pullquote:after{content:'"';top:auto;bottom:0;left:auto;right:0;line-height:.36em}a{text-decoration:none}a:link,a:visited{color:#00F}"
34
+CSS="h1,h2{font-weight:700}img,ul{width:440px;padding:0}em,img{text-align:left;align:left}img{height:208px}#nav-next:hover,#nav-prev:hover,a:hover{background:#333}body{color:#000;font-family:Helvetica Neue,Helvetica,Arial,sans-serif;background-color:#f0f0f0}h1{font-size:1.5rem;line-height:1.5rem}h2,h3{font-size:1rem;line-height:1rem}details,h3{font-weight:400;font-style:italic}h3{background-color:#cdcdcd}details{font-family:TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;font-size:.5rem}ul{list-style-type:none;color:#00F}ul:hover{cursor:pointer;cursor:hand}figure{margin-left:0;text-align:center}.img-heading,.img-nav{width:50px}#nav-next,#nav-prev,#nav-up{font-size:200%;font-weight:700;color:#00f}#nav-source{font-size:100%;font-weight:700;color:#00f}#article,#article-current{width:440px}.pullquote{padding:.5rem 1.5rem 0;font:700 1em/.8em TradeGothic,Helvetica Neue,Helvetica,Arial,sans-serif;position:relative;margin-bottom:1.5rem;z-index:1}.pullquote:after,.pullquote:before{color:silver;position:absolute;content:'"';font-size:5em;height:.5rem;line-height:.75em;top:0;left:-.07em;z-index:-1}.pullquote:after{content:'"';top:auto;bottom:0;left:auto;right:0;line-height:.36em}a{text-decoration:none}a:link,a:visited{color:#00F}"
34 35
 
35 36
 class Printable:
36 37
     def __repr__(self):
... ...
@@ -113,10 +114,12 @@ for article in d.entries:
113 114
   for link in article.links:
114 115
     if "enclosure" == link.rel:
115 116
       article_details.enclosure = link.href
117
+      
118
+  if article_details.enclosure == "":
119
+    soup_mysite = BeautifulSoup(article.description,"lxml")
120
+    content = soup_mysite.find("img")
121
+    article_details.enclosure = content.get('src')
116 122
   
117
-  #~ article_details.content = article.content.value.encode('utf-8').strip()
118
-  #~ print len(article_details.content)
119
-
120 123
   # Not Working as is. Generated image is too big
121 124
   #if article_details.enclosure is not None:
122 125
     #img_content=urllib2.urlopen(article.link).read()
... ...
@@ -128,6 +131,8 @@ for article in d.entries:
128 131
   f.write("\t"+article_details.title+"</div></ul>\n")
129 132
   articles.append(article_details)
130 133
   cpt=cpt+1
134
+  if cpt > MAX_ARTICLES:
135
+    break
131 136
 
132 137
 cpt_num=cpt
133 138
 f.write("\n<a name=\"article-top\"></a>\n")
... ...
@@ -135,7 +140,6 @@ f.write("<div id=\"article-current\"></div>\n\n")
135 140
 cpt=0
136 141
 for article in articles:
137 142
   print("-- {:d} : {:s}".format(cpt,article.title))
138
-  #~ print("  -- {:s}".format(article.link))
139 143
   opener = urllib2.build_opener()
140 144
   opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0')]
141 145
   try:
... ...
@@ -177,14 +181,11 @@ for article in articles:
177 181
     content = soup_mysite.find('div', attrs={'class':'article-page'})
178 182
   
179 183
   if content == None:
180
-    #~ content = soup_mysite.find('div', attrs={'id':'block-article'})
181 184
     only_text=soup_mysite.find('div', attrs={'id':'the-content'})
182 185
     content = "<h1>{:s}</h1><h3>{:s}</h3>{:s}".format(article.title,article.summary,only_text)
183
-    #~ content += soup_mysite.find('div', attrs={'id':'the-content'})
184
-    #~ content = soup_mysite.find('div', attrs={'class':'article-top'})
185
-    #~ content = soup_mysite.find('div', attrs={'class':'inner clearfix'})
186 186
     
187
-    #~ <div id="block-article" class="article" itemscope itemtype="Article">
187
+  if rss_url == "http://www.numerama.com/rss/news.rss":
188
+    content = "<h1>{:s}</h1>{:s}".format(article.title,content)
188 189
     
189 190
   article.content_only = str(content)
190 191
   article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
... ...
@@ -267,5 +268,7 @@ for article in articles:
267 268
   f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">&#8614;</div>\n")
268 269
   f.write("</div>\n\n")
269 270
   cpt=cpt+1
271
+  if cpt > MAX_ARTICLES:
272
+    break
270 273
   
271 274
 f.close()