...
|
...
|
@@ -15,11 +15,11 @@ verbose = False
|
15
|
15
|
output_filename = 'default.html'
|
16
|
16
|
rss_url = 'http://www.lemonde.fr/rss/une.xml'
|
17
|
17
|
|
18
|
|
-
|
19
|
18
|
options, remainder = getopt.getopt(sys.argv[1:], 'o:v', ['output=',
|
20
|
19
|
'verbose',
|
21
|
20
|
'url=',
|
22
|
21
|
])
|
|
22
|
+
|
23
|
23
|
for opt, arg in options:
|
24
|
24
|
if opt in ('-o', '--output'):
|
25
|
25
|
output_filename = arg
|
...
|
...
|
@@ -78,10 +78,6 @@ feed_details.link=d['feed']['link'].encode('utf-8').strip()
|
78
|
78
|
feed_details.subtitle=d['feed']['subtitle'].encode('utf-8').strip()
|
79
|
79
|
feed_details.num = len(d['entries'])
|
80
|
80
|
|
81
|
|
-
|
82
|
|
-#~ if 1 == debug:
|
83
|
|
- #~ feed_details.debug_print()
|
84
|
|
-
|
85
|
81
|
f = open(output_filename, 'w')
|
86
|
82
|
f.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd\">\n")
|
87
|
83
|
f.write("<html>\n")
|
...
|
...
|
@@ -91,20 +87,6 @@ f.write(" <meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"
|
91
|
87
|
f.write(" <meta name=\"viewport\" content=\"width=450px, user-scalable=no\">\n")
|
92
|
88
|
f.write(" <link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n")
|
93
|
89
|
f.write(" <link rel=\"icon\" type=\"image/ico\" href=\"favicon.ico\">\n")
|
94
|
|
-#~ f.write(" <!-- Touch Screen Detection -->\n")
|
95
|
|
-#~ f.write(" <script>\n")
|
96
|
|
-#~ f.write(" function isTouchDevice(){\n")
|
97
|
|
-#~ f.write(" return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
|
98
|
|
-#~ f.write(" }\n")
|
99
|
|
-#~ f.write(" </script>\n")
|
100
|
|
-#~ f.write(" <script type=\"text/javascript\">\n")
|
101
|
|
-#~ f.write(" /* Hack for Mobile */\n")
|
102
|
|
-#~ f.write(" if(isTouchDevice()===true) {\n")
|
103
|
|
-#~ f.write(" document.getElementById(\"img\").style.width = 400px;\n")
|
104
|
|
-#~ f.write(" document.getElementById(\"extract-content\").style.width = 400px;\n")
|
105
|
|
-#~ f.write(" document.getElementById(\"article-current\").style.width = 440px;\n")
|
106
|
|
-#~ f.write(" }\n")
|
107
|
|
-#~ f.write(" </script>\n")
|
108
|
90
|
f.write(" <script>\n")
|
109
|
91
|
f.write(" function onArticle(index) {\n")
|
110
|
92
|
f.write(" var string_index = \"article-\"+index;\n")
|
...
|
...
|
@@ -131,7 +113,7 @@ for article in d.entries:
|
131
|
113
|
if "enclosure" == link.rel:
|
132
|
114
|
article_details.enclosure = link.href
|
133
|
115
|
|
134
|
|
- # Npot Wroking as is. Generated image is too big
|
|
116
|
+ # Not working as is. Generated image is too big
|
135
|
117
|
#if article_details.enclosure is not None:
|
136
|
118
|
#img_content=urllib2.urlopen(article.link).read()
|
137
|
119
|
#article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
|
...
|
...
|
@@ -153,6 +135,9 @@ for article in articles:
|
153
|
135
|
article.content = response.read()
|
154
|
136
|
soup_mysite = BeautifulSoup(article.content,"lxml")
|
155
|
137
|
content = soup_mysite.find("article")
|
|
138
|
+ if content == None:
|
|
139
|
+ content = soup_mysite.find('div', attrs={'class':'article-page'})
|
|
140
|
+
|
156
|
141
|
article.content_only = str(content)
|
157
|
142
|
article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
|
158
|
143
|
article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
|
...
|
...
|
@@ -160,6 +145,7 @@ for article in articles:
|
160
|
145
|
article.content_only = article.content_only.replace('<div class="toolbar"></div>','')
|
161
|
146
|
article.content_only = article.content_only.replace('<figure class="illustration_haut " style="width: 534px">','<figure>')
|
162
|
147
|
article.content_only = article.content_only.replace('<figure class="illustration_haut">','<figure>')
|
|
148
|
+ article.content_only = article.content_only.replace('<figure class="illustration_haut " style="width: 534px">','<figure>')
|
163
|
149
|
article.content_only = article.content_only.replace('<span id="publisher" itemprop="Publisher" data-source="LE MONDE">Le Monde</span>','Le Monde')
|
164
|
150
|
article.content_only = article.content_only.replace(' data-lazyload="false" ',' ')
|
165
|
151
|
article.content_only = article.content_only.replace('Par<span>','Par <span>')
|
...
|
...
|
@@ -171,18 +157,45 @@ for article in articles:
|
171
|
157
|
article.content_only = article.content_only.replace('onclick="return false;" ','')
|
172
|
158
|
article.content_only = article.content_only.replace('target="_blank"','')
|
173
|
159
|
article.content_only = article.content_only.replace('class="lien_interne rub"','')
|
174
|
|
- regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
|
175
|
|
- article.content_only = regexConjug.sub('\\1',article.content_only)
|
|
160
|
+ article.content_only = article.content_only.replace('<a class="auteur" target="_blank" ','<a target="_blank" ')
|
|
161
|
+ article.content_only = article.content_only.replace('<a class="lien_interne rub" ','<a target="_blank" ')
|
|
162
|
+ article.content_only = article.content_only.replace('<a target=\'_blank\' onclick=\'return false;\' class=\'lien_interne conjug\' ','<a target="_blank" ')
|
|
163
|
+ article.content_only = article.content_only.replace('<h1 class="tt2" itemprop="Headline">','<h1>')
|
|
164
|
+ article.content_only = article.content_only.replace('<h2 class="taille_courante">','<h2>')
|
|
165
|
+ article.content_only = article.content_only.replace('<h2 class="intertitre">','<h2>')
|
|
166
|
+ article.content_only = article.content_only.replace('<h2 class="taille_courante">','<h2>')
|
|
167
|
+ article.content_only = article.content_only.replace('<p class="lire">','<p>')
|
|
168
|
+ article.content_only = article.content_only.replace('<p class="lire marqueur_restreint_atome">','<p>')
|
|
169
|
+ article.content_only = article.content_only.replace('<p class="bloc_signature">','<p>')
|
|
170
|
+ article.content_only = article.content_only.replace('<strong class="txt1 txt_gris_moyen">Suivre</strong>','')
|
|
171
|
+ article.content_only = article.content_only.replace('<br><span class="txt_gris_clair">Journaliste au Monde</span>','Journaliste au Monde')
|
|
172
|
+ article.content_only = article.content_only.replace('<img width="534"','<img width="400"')
|
|
173
|
+ article.content_only = article.content_only.replace('<!-- atome snippet -->','')
|
|
174
|
+ article.content_only = article.content_only.replace('?syndication=131181','')
|
|
175
|
+ article.content_only = article.content_only.replace('<iframe width="544" height="306" ','<iframe width="440" height="248" ')
|
|
176
|
+ article.content_only = article.content_only.replace(' frameborder="0" width="534" height="320">',' frameborder="0" width="440" height="264">')
|
|
177
|
+ article.content_only = article.content_only.replace(' onload="lmd.pic(this);" onerror="lmd.pic(this);" class="lazy-retina">','>')
|
|
178
|
+ article.content_only = article.content_only.replace('<figcaption class="legende" data-caption="','<br><em>')
|
|
179
|
+ article.content_only = article.content_only.replace('<figcaption class="legende">','<br><em>')
|
|
180
|
+ article.content_only = article.content_only.replace('</figcaption>','</em>')
|
|
181
|
+ article.content_only = article.content_only.replace('\\n','')
|
176
|
182
|
#~ article.content_only = article.content_only.replace('','')
|
177
|
183
|
#~ article.content_only = article.content_only.replace('','')
|
178
|
184
|
#~ article.content_only = article.content_only.replace('','')
|
179
|
185
|
#~ article.content_only = article.content_only.replace('','')
|
180
|
186
|
#~ article.content_only = article.content_only.replace('','')
|
181
|
|
- #~ article.content_only = html_slimmer(article.content_only)
|
182
|
|
- article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
|
183
|
|
-
|
184
|
|
-
|
|
187
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
188
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
189
|
+ #~ article.content_only = article.content_only.replace('','')
|
|
190
|
+ #~
|
|
191
|
+ # Cleanup
|
|
192
|
+ regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
|
|
193
|
+ article.content_only = regexConjug.sub('\\1',article.content_only)
|
185
|
194
|
|
|
195
|
+ # Diet
|
|
196
|
+ #~ article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
|
|
197
|
+ #~ article.content_only = article.content_only.encode("utf-8")
|
|
198
|
+
|
186
|
199
|
cpt_prev=cpt-1
|
187
|
200
|
if cpt_prev < 0:
|
188
|
201
|
cpt_prev = 0
|