Showing 2 changed files with 48 additions and 6 deletions
README.md  +5 -0
@@ -22,3 +22,8 @@ newsfetch.py -u <rss url> -o <output filename>
 The feed is parsed and a list of available articles is created.
 The article content (i.e. the feed link) is fetched automatically and the content is extracted:
 - **&lt;article&gt;...&lt;/article&gt;**
+
+## Examples
+- ./newsfetch.py --url http://www.vice.com/fr/rss
+- ./newsfetch.py --url http://www.lemonde.fr/rss/une.xml
+- ./newsfetch.py --url https://www.slate.fr/rss.xml
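
Note: the new Examples section only exercises `--url`; a minimal sketch of the option parsing the README's usage line implies (argparse, the `--output` long form, and its default are assumptions, not taken from this patch):

```python
# Hypothetical CLI surface matching "newsfetch.py -u <rss url> -o <output filename>".
import argparse

parser = argparse.ArgumentParser(description="Fetch an RSS feed and extract article content.")
parser.add_argument("-u", "--url", required=True, help="RSS feed URL")
parser.add_argument("-o", "--output", default="news.html", help="output HTML filename (default is an assumption)")
args = parser.parse_args()
```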
newsfetch.py  +43 -6
@@ -103,6 +103,7 @@ f.write("<h1 id=\"top\">"+feed_details.title+"</h1>\n")
 
 articles=list()
 cpt=0
+
 for article in d.entries:
   article_details = ArticleDetails()
   article_details.title = article.title.encode('utf-8').strip()
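
Note: the loop here iterates `d.entries` with `.title` and per-link `.rel`, which matches feedparser's API; a sketch of how `d` is presumably built (the import and parse call are inferred, not shown in this diff):

```python
# Assumed origin of `d`: feedparser returns an object whose .entries
# items expose .title and .links with .rel/.href, as the loop above uses.
import feedparser

d = feedparser.parse("http://www.lemonde.fr/rss/une.xml")  # any of the README's example feeds
print("{:d} entries in feed".format(len(d.entries)))
```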
@@ -112,8 +113,11 @@ for article in d.entries:
   for link in article.links:
     if "enclosure" == link.rel:
       article_details.enclosure = link.href
+
+  #~ article_details.content = article.content.value.encode('utf-8').strip()
+  #~ print len(article_details.content)
 
-  # Not Wroking as is. Generated image is too big
+  # Not working as is: the generated image is too big
   #if article_details.enclosure is not None:
     #img_content=urllib2.urlopen(article.link).read()
     #article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
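
Note: the commented block wants to inline the enclosure image as a data URI but, per the comment, the result is too big; it also reads `article.link` (the article page) rather than the enclosure URL. A sketch of the idea with a size cap (both the cap and the enclosure-URL fix are assumptions):

```python
# Inline the enclosure as a base64 data URI, but only below a size cap.
import base64
import urllib2

MAX_INLINE_BYTES = 100 * 1024  # arbitrary threshold, an assumption

if article_details.enclosure is not None:
    img_content = urllib2.urlopen(article_details.enclosure).read()  # fetch the image itself, not article.link
    if len(img_content) <= MAX_INLINE_BYTES:
        article_details.enclosure = "data:image/jpg;base64," + base64.b64encode(img_content)
```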
@@ -131,13 +135,47 @@ f.write("<div id=\"article-current\"></div>\n\n")
 cpt=0
 for article in articles:
   print("-- {:d} : {:s}".format(cpt,article.title))
-  response = urllib2.urlopen(article.link)
-  article.content = response.read()
+  #~ print("  -- {:s}".format(article.link))
+  opener = urllib2.build_opener()
+  opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0')]
+  try:
+    response = opener.open(article.link)
+    article.content = response.read()
+  except urllib2.URLError as e:
+    print("  -- {:s}".format(str(e)))  # response is unbound here; HTTPError (with its code) is a subclass of URLError
+    article.content = None
+
+  if article.content is None:
+    cpt_prev=cpt-1
+    if cpt_prev < 0:
+      cpt_prev = 0
+    cpt_next=cpt+1
+    if cpt_next > cpt_num:
+      cpt_next = cpt_num
+    f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: none;\">\n")
+    f.write("<hr><a name=\"article-"+str(cpt)+"\">")
+    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">&#8670;</a></div>\n")
+    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">&#8613;</a></div>&nbsp;\n")
+    f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div>&nbsp;")
+    f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">&#8612;</div>\n")
+    f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">&#8614;</div>\n")
+    f.write("<div class=\"extract-content\" id=\""+str(cpt)+"\">\n")
+    f.write("<p><em>(content unavailable)</em></p>")  # article.content_only is never set when the fetch fails
+    f.write("\n</div>\n")
+    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">&#8670;</a></div>\n")
+    f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">&#8613;</a></div>&nbsp;\n")
+    f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div>&nbsp;")
+    f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">&#8612;</div>\n")
+    f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">&#8614;</div>\n")
+    f.write("</div>\n\n")
+    cpt=cpt+1
+    continue
+
   soup_mysite = BeautifulSoup(article.content,"lxml")
   content = soup_mysite.find("article")
   if content == None:
     content = soup_mysite.find('div', attrs={'class':'article-page'})
-
+    
   article.content_only = str(content)
   article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
   article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
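
Note: two things happen in this hunk: the fetch now sends a browser-like User-Agent (presumably because some sites refuse urllib2's default one), and a failed fetch falls through to a placeholder block instead of crashing. The happy path then reduces each page to its `<article>` element, with a `div.article-page` fallback. A compact sketch of that chain under the same assumptions (bs4's BeautifulSoup with the lxml parser, as the patch uses):

```python
# Fetch with a spoofed User-Agent, then extract the main article node.
import urllib2
from bs4 import BeautifulSoup

def extract_article(url):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    soup = BeautifulSoup(opener.open(url).read(), "lxml")
    node = soup.find("article")
    if node is None:
        node = soup.find('div', attrs={'class': 'article-page'})
    return str(node) if node is not None else None
```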
@@ -193,8 +231,7 @@ for article in articles:
   article.content_only = regexConjug.sub('\\1',article.content_only)
 
   # Diet
-  #~ article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
-  #~ article.content_only = article.content_only.encode("utf-8")
+  article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
 
   cpt_prev=cpt-1
   if cpt_prev < 0:
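
Note: this hunk re-enables the "diet" pass: the pre-replace flattens newlines, tabs, and carriage returns so the slimmer can collapse runs of whitespace. `html_slimmer` is assumed to be the minifier from the third-party `slimmer` package; a small usage sketch:

```python
# Flatten hard whitespace, then minify. html_slimmer is assumed to come
# from the third-party `slimmer` package.
from slimmer import html_slimmer

html = "<div>\n\t<p>  hello   world  </p>\n</div>"
print(html_slimmer(html.strip().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')))
```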