...
|
...
|
@@ -6,16 +6,19 @@ import base64
|
6
|
6
|
import pprint
|
7
|
7
|
import urllib2
|
8
|
8
|
import lxml.html
|
|
9
|
+import re
|
9
|
10
|
import sys
|
10
|
11
|
import getopt
|
|
12
|
+from slimmer import html_slimmer
|
11
|
13
|
|
12
|
14
|
verbose = False
|
13
|
15
|
output_filename = 'default.html'
|
14
|
|
-rss_url = 'rss_url'
|
|
16
|
+rss_url = 'http://www.lemonde.fr/rss/une.xml'
|
15
|
17
|
|
16
|
18
|
|
17
|
19
|
options, remainder = getopt.getopt(sys.argv[1:], 'o:v', ['output=',
|
18
|
20
|
'verbose',
|
|
21
|
+ 'url=',
|
19
|
22
|
])
|
20
|
23
|
for opt, arg in options:
|
21
|
24
|
if opt in ('-o', '--output'):
|
...
|
...
|
@@ -57,15 +60,19 @@ class ArticleDetails(Printable):
|
57
|
60
|
print("ArticleDetails:link :{:s}".format(self.link))
|
58
|
61
|
print("ArticleDetails:summary :{:s}".format(self.summary))
|
59
|
62
|
print("ArticleDetails:enclosure :{:s}".format(self.enclosure))
|
60
|
|
- #~ print("ArticleDetails:content :{:s}".format(self.content))
|
61
|
63
|
print("ArticleDetails:content_only :{:s}".format(self.content))
|
62
|
64
|
print("ArticleDetails:content : ================")
|
63
|
65
|
|
|
66
|
+
|
|
67
|
+def process_match(m):
|
|
68
|
+ # Process the match here.
|
|
69
|
+ return ''
|
|
70
|
+
|
64
|
71
|
debug=1
|
65
|
72
|
|
66
|
73
|
feed_details=FeedDetails()
|
67
|
|
-
|
68
|
|
-d = feedparser.parse('http://www.lemonde.fr/rss/une.xml')
|
|
74
|
+print rss_url
|
|
75
|
+d = feedparser.parse(rss_url)
|
69
|
76
|
feed_details.title=d['feed']['title'].encode('utf-8').strip()
|
70
|
77
|
feed_details.link=d['feed']['link'].encode('utf-8').strip()
|
71
|
78
|
feed_details.subtitle=d['feed']['subtitle'].encode('utf-8').strip()
|
...
|
...
|
@@ -84,20 +91,20 @@ f.write(" <meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"
|
84
|
91
|
f.write(" <meta name=\"viewport\" content=\"width=450px, user-scalable=no\">\n")
|
85
|
92
|
f.write(" <link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n")
|
86
|
93
|
f.write(" <link rel=\"icon\" type=\"image/ico\" href=\"favicon.ico\">\n")
|
87
|
|
-f.write(" <!-- Touch Screen Detection -->\n")
|
88
|
|
-f.write(" <script>\n")
|
89
|
|
-f.write(" function isTouchDevice(){\n")
|
90
|
|
-f.write(" return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
|
91
|
|
-f.write(" }\n")
|
92
|
|
-f.write(" </script>\n")
|
93
|
|
-f.write(" <script type=\"text/javascript\">\n")
|
94
|
|
-f.write(" /* Hack for Mobile */\n")
|
95
|
|
-f.write(" if(isTouchDevice()===true) {\n")
|
96
|
|
-f.write(" document.getElementById(\"img\").style.width = 400px;\n")
|
97
|
|
-f.write(" document.getElementById(\"extract-content\").style.width = 400px;\n")
|
98
|
|
-f.write(" document.getElementById(\"article-current\").style.width = 440px;\n")
|
99
|
|
-f.write(" }\n")
|
100
|
|
-f.write(" </script>\n")
|
|
94
|
+#~ f.write(" <!-- Touch Screen Detection -->\n")
|
|
95
|
+#~ f.write(" <script>\n")
|
|
96
|
+#~ f.write(" function isTouchDevice(){\n")
|
|
97
|
+#~ f.write(" return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
|
|
98
|
+#~ f.write(" }\n")
|
|
99
|
+#~ f.write(" </script>\n")
|
|
100
|
+#~ f.write(" <script type=\"text/javascript\">\n")
|
|
101
|
+#~ f.write(" /* Hack for Mobile */\n")
|
|
102
|
+#~ f.write(" if(isTouchDevice()===true) {\n")
|
|
103
|
+#~ f.write(" document.getElementById(\"img\").style.width = 400px;\n")
|
|
104
|
+#~ f.write(" document.getElementById(\"extract-content\").style.width = 400px;\n")
|
|
105
|
+#~ f.write(" document.getElementById(\"article-current\").style.width = 440px;\n")
|
|
106
|
+#~ f.write(" }\n")
|
|
107
|
+#~ f.write(" </script>\n")
|
101
|
108
|
f.write(" <script>\n")
|
102
|
109
|
f.write(" function onArticle(index) {\n")
|
103
|
110
|
f.write(" var string_index = \"article-\"+index;\n")
|
...
|
...
|
@@ -128,7 +135,7 @@ for article in d.entries:
|
128
|
135
|
#if article_details.enclosure is not None:
|
129
|
136
|
#img_content=urllib2.urlopen(article.link).read()
|
130
|
137
|
#article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
|
131
|
|
- f.write("\t<ul><div onclick=\"onArticle("+str(cpt)+")\" style=\"display:inline;\">\n")
|
|
138
|
+ f.write("<ul><div onclick=\"onArticle("+str(cpt)+")\" style=\"display:inline;\">\n")
|
132
|
139
|
f.write("\t<img src=\""+article_details.enclosure+"\" style=\"display:inline;\"><br>\n")
|
133
|
140
|
f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
|
134
|
141
|
f.write("\t<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↧</a></div>\n")
|
...
|
...
|
@@ -146,24 +153,52 @@ for article in articles:
|
146
|
153
|
article.content = response.read()
|
147
|
154
|
soup_mysite = BeautifulSoup(article.content,"lxml")
|
148
|
155
|
content = soup_mysite.find("article")
|
149
|
|
- article.content_only = content
|
|
156
|
+ article.content_only = str(content)
|
|
157
|
+ article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
|
|
158
|
+ article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
|
|
159
|
+ article.content_only = article.content_only.replace('<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script></div>','')
|
|
160
|
+ article.content_only = article.content_only.replace('<div class="toolbar"></div>','')
|
|
161
|
+ article.content_only = article.content_only.replace('<figure class="illustration_haut " style="width: 534px">','<figure>')
|
|
162
|
+ article.content_only = article.content_only.replace('<figure class="illustration_haut">','<figure>')
|
|
163
|
+ article.content_only = article.content_only.replace('<span id="publisher" itemprop="Publisher" data-source="LE MONDE">Le Monde</span>','Le Monde')
|
|
164
|
+ article.content_only = article.content_only.replace(' data-lazyload="false" ',' ')
|
|
165
|
+ article.content_only = article.content_only.replace('Par<span>','Par <span>')
|
|
166
|
+ article.content_only = article.content_only.replace('<span class="source_image" ','<span ')
|
|
167
|
+ article.content_only = article.content_only.replace('<div class="bg_gris_moyen signature">','<div>')
|
|
168
|
+ article.content_only = article.content_only.replace('<p itemprop="author" class="auteur txt2_120">','<p>')
|
|
169
|
+ article.content_only = article.content_only.replace('<li class="clearfix" itemprop="author">','<li>')
|
|
170
|
+ article.content_only = article.content_only.replace('<img width="534"','<img width="400"')
|
|
171
|
+ article.content_only = article.content_only.replace('onclick="return false;" ','')
|
|
172
|
+ article.content_only = article.content_only.replace('target="_blank"','')
|
|
173
|
+ article.content_only = article.content_only.replace('class="lien_interne rub"','')
|
|
174
|
+ regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
|
|
175
|
+ article.content_only = regexConjug.sub('\\1',article.content_only)
|
|
176
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
177
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
178
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
179
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
180
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
181
|
+ #~ article.content_only = html_slimmer(article.content_only)
|
|
182
|
+ article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
|
150
|
186
|
cpt_prev=cpt-1
|
151
|
187
|
if cpt_prev < 0:
|
152
|
188
|
cpt_prev = 0
|
153
|
189
|
cpt_next=cpt+1
|
154
|
190
|
if cpt_next > cpt_num:
|
155
|
191
|
cpt_next = cpt_num
|
156
|
|
-
|
157
|
|
- f.write("<!-- ==================== article "+str(cpt)+" ============== -->\n")
|
|
192
|
+
|
158
|
193
|
f.write("<div class=\"article\" id=\"article-"+str(cpt)+"\" style=\"display: none;\">\n")
|
159
|
|
- f.write("<hr>\n<a name=\"article-"+str(cpt)+"\">\n")
|
|
194
|
+ f.write("<hr><a name=\"article-"+str(cpt)+"\">")
|
160
|
195
|
f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
|
161
|
196
|
f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
|
162
|
197
|
f.write("<div id=\"nav-source\" style=\"display:inline;\"><a href=\""+article.link+"\" target=\"new-"+str(cpt)+"\">source</a></div> ")
|
163
|
198
|
f.write("<div id=\"nav-prev\" onclick=\"onArticle("+str(cpt_prev)+")\" style=\"display:inline;\">↤</div>\n")
|
164
|
199
|
f.write("<div id=\"nav-next\" onclick=\"onArticle("+str(cpt_next)+")\" style=\"display:inline;\">↦</div>\n")
|
165
|
200
|
f.write("<div class=\"extract-content\" id=\""+str(cpt)+"\">\n")
|
166
|
|
- f.write(str(content))
|
|
201
|
+ f.write(article.content_only)
|
167
|
202
|
f.write("\n</div>\n")
|
168
|
203
|
f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#top\">⇞</a></div>\n")
|
169
|
204
|
f.write("<div id=\"nav-up\" style=\"display:inline;\"><a href=\"#article-top\">↥</a></div> \n")
|