Showing 2 changed files with 80 additions and 24 deletions
+21
README.md
... ...
@@ -1,3 +1,24 @@
1 1
 # newsfetch.py
2 2
 
3 3
 Python scraper that builds one big HTML file from a newspaper RSS feed
4
+
5
+## Dependencies
6
+
7
+```bash
8
+sudo apt-get install libxml2-dev libxslt-dev
9
+sudo pip install bs4 feedparser lxml slimmer
10
+```
11
+
12
+## Usage
13
+```bash
14
+newsfetch.py --url <rss url> -o <output filename>
15
+```
16
+
17
+## Default Parameters
18
+- url : **http://www.lemonde.fr/rss/une.xml**
19
+- output : **default.html**
20
+
21
+## How it works
22
+The feed is parsed and a list of the available articles is created.
23
+The content of each article (i.e. the page behind the feed link) is fetched automatically, and the following element is extracted from it:
24
+- **&lt;article&gt;...&lt;/article&gt;**
+59 -24
newsfetch.py
... ...
@@ -6,16 +6,19 @@ import base64
6 6
 import pprint
7 7
 import urllib2
8 8
 import lxml.html
9
+import re
9 10
 import sys
10 11
 import getopt
12
+from slimmer import html_slimmer 
11 13
 
12 14
 verbose = False
13 15
 output_filename = 'default.html'
14
-rss_url = 'rss_url'
16
+rss_url = 'http://www.lemonde.fr/rss/une.xml'
15 17
 
16 18
 
17 19
 options, remainder = getopt.getopt(sys.argv[1:], 'o:v', ['output=', 
18 20
                                                          'verbose',
21
+                                                         'url=',
19 22
                                                          ])
20 23
 for opt, arg in options:
21 24
     if opt in ('-o', '--output'):
... ...
@@ -57,15 +60,19 @@ class ArticleDetails(Printable):
57 60
     print("ArticleDetails:link      :{:s}".format(self.link))
58 61
     print("ArticleDetails:summary   :{:s}".format(self.summary))
59 62
     print("ArticleDetails:enclosure :{:s}".format(self.enclosure))
60
-    #~ print("ArticleDetails:content   :{:s}".format(self.content))    
61 63
     print("ArticleDetails:content_only :{:s}".format(self.content))    
62 64
     print("ArticleDetails:content   : ================")
63 65
 
66
+
67
+def process_match(m):
68
+    # Process the match here.
69
+    return ''
70
+    
64 71
 debug=1
65 72
 
66 73
 feed_details=FeedDetails()
67
-
68
-d = feedparser.parse('http://www.lemonde.fr/rss/une.xml')
74
+print rss_url
75
+d = feedparser.parse(rss_url)
69 76
 feed_details.title=d['feed']['title'].encode('utf-8').strip()
70 77
 feed_details.link=d['feed']['link'].encode('utf-8').strip()
71 78
 feed_details.subtitle=d['feed']['subtitle'].encode('utf-8').strip()
... ...
@@ -84,20 +91,20 @@ f.write("	<meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"
84 91
 f.write("	<meta name=\"viewport\" content=\"width=450px, user-scalable=no\">\n")
85 92
 f.write("	<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n")
86 93
 f.write("	<link rel=\"icon\" type=\"image/ico\" href=\"favicon.ico\">\n")
87
-f.write("	<!-- Touch Screen Detection -->\n")
88
-f.write("	<script>\n")
89
-f.write("	function isTouchDevice(){\n")
90
-f.write("		return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
91
-f.write("	}\n")
92
-f.write("	</script>\n")
93
-f.write("	<script type=\"text/javascript\">\n")
94
-f.write("	/* Hack for Mobile */\n")
95
-f.write("	if(isTouchDevice()===true) {\n")
96
-f.write("		document.getElementById(\"img\").style.width = 400px;\n")
97
-f.write("		document.getElementById(\"extract-content\").style.width = 400px;\n")
98
-f.write("		document.getElementById(\"article-current\").style.width = 440px;\n")
99
-f.write("	}\n")
100
-f.write("	</script>\n")
94
+#~ f.write("	<!-- Touch Screen Detection -->\n")
95
+#~ f.write("	<script>\n")
96
+#~ f.write("	function isTouchDevice(){\n")
97
+#~ f.write("		return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
98
+#~ f.write("	}\n")
99
+#~ f.write("	</script>\n")
100
+#~ f.write("	<script type=\"text/javascript\">\n")
101
+#~ f.write("	/* Hack for Mobile */\n")
102
+#~ f.write("	if(isTouchDevice()===true) {\n")
103
+#~ f.write("		document.getElementById(\"img\").style.width = 400px;\n")
104
+#~ f.write("		document.getElementById(\"extract-content\").style.width = 400px;\n")
105
+#~ f.write("		document.getElementById(\"article-current\").style.width = 440px;\n")
106
+#~ f.write("	}\n")
107
+#~ f.write("	</script>\n")
101 108
 f.write("	<script>\n")
102 109
 f.write("	function onArticle(index) {\n")
103 110
 f.write("		var string_index = \"article-\"+index;\n")
... ...
@@ -128,7 +135,7 @@ for article in d.entries:
128 135
   #if article_details.enclosure is not None:
129 136
     #img_content=urllib2.urlopen(article.link).read()
130 137
     #article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
131
-  f.write("\t<ul><div onclick=\"onArticle("+str(cpt)+")\" style=\"display:inline;\">\n")  
138
+  f.write("<ul><div onclick=\"onArticle("+str(cpt)+")\" style=\"display:inline;\">\n")  
132 139
   f.write("\t<img src=\""+article_details.enclosure+"\" style=\"display:inline;\"><br>\n")
133 140
   f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">&#8670;</a></div>\n")
134 141
   f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">&#8615;</a></div>\n")
... ...
@@ -146,24 +153,52 @@ for article in articles:
146 153
   article.content = response.read()
147 154
   soup_mysite = BeautifulSoup(article.content,"lxml")
148 155
   content = soup_mysite.find("article")
149
-  article.content_only = content
156
+  article.content_only = str(content)
157
+  article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
158
+  article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
159
+  article.content_only = article.content_only.replace('<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script></div>','')
160
+  article.content_only = article.content_only.replace('<div class="toolbar"></div>','')
161
+  article.content_only = article.content_only.replace('<figure class="illustration_haut   " style="width: 534px">','<figure>')
162
+  article.content_only = article.content_only.replace('<figure class="illustration_haut">','<figure>')
163
+  article.content_only = article.content_only.replace('<span id="publisher" itemprop="Publisher" data-source="LE MONDE">Le Monde</span>','Le Monde')
164
+  article.content_only = article.content_only.replace(' data-lazyload="false" ',' ')
165
+  article.content_only = article.content_only.replace('Par<span>','Par <span>')
166
+  article.content_only = article.content_only.replace('<span class="source_image" ','<span ')
167
+  article.content_only = article.content_only.replace('<div class="bg_gris_moyen signature">','<div>')
168
+  article.content_only = article.content_only.replace('<p itemprop="author" class="auteur txt2_120">','<p>')
169
+  article.content_only = article.content_only.replace('<li class="clearfix" itemprop="author">','<li>')
170
+  article.content_only = article.content_only.replace('<img width="534"','<img width="400"')
171
+  article.content_only = article.content_only.replace('onclick="return false;" ','')
172
+  article.content_only = article.content_only.replace('target="_blank"','')
173
+  article.content_only = article.content_only.replace('class="lien_interne rub"','')
174
+  regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
175
+  article.content_only = regexConjug.sub('\\1',article.content_only)
176
+  #~ article.content_only = article.content_only.replace('','')
177
+  #~ article.content_only = article.content_only.replace('','')
178
+  #~ article.content_only = article.content_only.replace('','')
179
+  #~ article.content_only = article.content_only.replace('','')
180
+  #~ article.content_only = article.content_only.replace('','')
181
+  #~ article.content_only = html_slimmer(article.content_only)
182
+  article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
183
+  
184
+
185
+
150 186
   cpt_prev=cpt-1
151 187
   if cpt_prev < 0:
152 188
     cpt_prev = 0
153 189
   cpt_next=cpt+1
154 190
   if cpt_next > cpt_num:
155 191
     cpt_next = cpt_num
156
-  
157
-  f.write("<!-- ==================== article "+str(cpt)+" ============== -->\n")
192
+
158 193
   f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: none;\">\n")
159
-  f.write("<hr>\n<a name=\"article-"+str(cpt)+"\">\n")
194
+  f.write("<hr><a name=\"article-"+str(cpt)+"\">")
160 195
   f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">&#8670;</a></div>\n")
161 196
   f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">&#8613;</a></div>&nbsp;\n")
162 197
   f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div>&nbsp;")
163 198
   f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">&#8612;</div>\n")
164 199
   f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">&#8614;</div>\n")
165 200
   f.write("<div class=\"extract-content\" id=\""+str(cpt)+"\">\n")
166
-  f.write(str(content))
201
+  f.write(article.content_only)
167 202
   f.write("\n</div>\n")
168 203
   f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">&#8670;</a></div>\n")
169 204
   f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">&#8613;</a></div>&nbsp;\n")