Showing 1 changed files with 39 additions and 26 deletions
+39 -26
newsfetch.py
... ...
@@ -15,11 +15,11 @@ verbose = False
15 15
 output_filename = 'default.html'
16 16
 rss_url = 'http://www.lemonde.fr/rss/une.xml'
17 17
 
18
-
19 18
 options, remainder = getopt.getopt(sys.argv[1:], 'o:v', ['output=', 
20 19
                                                          'verbose',
21 20
                                                          'url=',
22 21
                                                          ])
22
+
23 23
 for opt, arg in options:
24 24
     if opt in ('-o', '--output'):
25 25
         output_filename = arg
... ...
@@ -78,10 +78,6 @@ feed_details.link=d['feed']['link'].encode('utf-8').strip()
78 78
 feed_details.subtitle=d['feed']['subtitle'].encode('utf-8').strip()
79 79
 feed_details.num = len(d['entries'])
80 80
 
81
-
82
-#~ if 1 == debug:
83
-  #~ feed_details.debug_print()
84
-
85 81
 f = open(output_filename, 'w')
86 82
 f.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd\">\n")
87 83
 f.write("<html>\n")
... ...
@@ -91,20 +87,6 @@ f.write("	<meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"
91 87
 f.write("	<meta name=\"viewport\" content=\"width=450px, user-scalable=no\">\n")
92 88
 f.write("	<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n")
93 89
 f.write("	<link rel=\"icon\" type=\"image/ico\" href=\"favicon.ico\">\n")
94
-#~ f.write("	<!-- Touch Screen Detection -->\n")
95
-#~ f.write("	<script>\n")
96
-#~ f.write("	function isTouchDevice(){\n")
97
-#~ f.write("		return true == (\"ontouchstart\" in window || window.DocumentTouch && document instanceof DocumentTouch);\n")
98
-#~ f.write("	}\n")
99
-#~ f.write("	</script>\n")
100
-#~ f.write("	<script type=\"text/javascript\">\n")
101
-#~ f.write("	/* Hack for Mobile */\n")
102
-#~ f.write("	if(isTouchDevice()===true) {\n")
103
-#~ f.write("		document.getElementById(\"img\").style.width = 400px;\n")
104
-#~ f.write("		document.getElementById(\"extract-content\").style.width = 400px;\n")
105
-#~ f.write("		document.getElementById(\"article-current\").style.width = 440px;\n")
106
-#~ f.write("	}\n")
107
-#~ f.write("	</script>\n")
108 90
 f.write("	<script>\n")
109 91
 f.write("	function onArticle(index) {\n")
110 92
 f.write("		var string_index = \"article-\"+index;\n")
... ...
@@ -131,7 +113,7 @@ for article in d.entries:
131 113
     if "enclosure" == link.rel:
132 114
       article_details.enclosure = link.href
133 115
 
134
-  # Npot Wroking as is. Generated image is too big
116
+  # Not working as is. Generated image is too big
135 117
   #if article_details.enclosure is not None:
136 118
     #img_content=urllib2.urlopen(article.link).read()
137 119
     #article_details.enclosure = "data:image/jpg;base64,"+base64.b64encode(img_content)
... ...
@@ -153,6 +135,9 @@ for article in articles:
153 135
   article.content = response.read()
154 136
   soup_mysite = BeautifulSoup(article.content,"lxml")
155 137
   content = soup_mysite.find("article")
138
+  if content == None:
139
+    content = soup_mysite.find('div', attrs={'class':'article-page'})
140
+
156 141
   article.content_only = str(content)
157 142
   article.content_only = article.content_only.replace(" href=\"/", " href=\"http://www.lemonde.fr/")
158 143
   article.content_only = article.content_only.replace('<script>require(["twitter/widgets"]);</script>','')
... ...
@@ -160,6 +145,7 @@ for article in articles:
160 145
   article.content_only = article.content_only.replace('<div class="toolbar"></div>','')
161 146
   article.content_only = article.content_only.replace('<figure class="illustration_haut   " style="width: 534px">','<figure>')
162 147
   article.content_only = article.content_only.replace('<figure class="illustration_haut">','<figure>')
148
+  article.content_only = article.content_only.replace('<figure class="illustration_haut " style="width: 534px">','<figure>')
163 149
   article.content_only = article.content_only.replace('<span id="publisher" itemprop="Publisher" data-source="LE MONDE">Le Monde</span>','Le Monde')
164 150
   article.content_only = article.content_only.replace(' data-lazyload="false" ',' ')
165 151
   article.content_only = article.content_only.replace('Par<span>','Par <span>')
... ...
@@ -171,18 +157,45 @@ for article in articles:
171 157
   article.content_only = article.content_only.replace('onclick="return false;" ','')
172 158
   article.content_only = article.content_only.replace('target="_blank"','')
173 159
   article.content_only = article.content_only.replace('class="lien_interne rub"','')
174
-  regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
175
-  article.content_only = regexConjug.sub('\\1',article.content_only)
160
+  article.content_only = article.content_only.replace('<a class="auteur" target="_blank" ','<a target="_blank" ')
161
+  article.content_only = article.content_only.replace('<a class="lien_interne rub" ','<a target="_blank" ')
162
+  article.content_only = article.content_only.replace('<a target=\'_blank\' onclick=\'return false;\' class=\'lien_interne conjug\' ','<a target="_blank" ')
163
+  article.content_only = article.content_only.replace('<h1 class="tt2" itemprop="Headline">','<h1>')
164
+  article.content_only = article.content_only.replace('<h2 class="taille_courante">','<h2>')
165
+  article.content_only = article.content_only.replace('<h2 class="intertitre">','<h2>')
166
+  article.content_only = article.content_only.replace('<h2 class="taille_courante">','<h2>')
167
+  article.content_only = article.content_only.replace('<p class="lire">','<p>')
168
+  article.content_only = article.content_only.replace('<p class="lire marqueur_restreint_atome">','<p>')
169
+  article.content_only = article.content_only.replace('<p class="bloc_signature">','<p>')
170
+  article.content_only = article.content_only.replace('<strong class="txt1 txt_gris_moyen">Suivre</strong>','')
171
+  article.content_only = article.content_only.replace('<br><span class="txt_gris_clair">Journaliste au Monde</span>','Journaliste au Monde')
172
+  article.content_only = article.content_only.replace('<img width="534"','<img width="400"')
173
+  article.content_only = article.content_only.replace('<!-- atome snippet -->','')
174
+  article.content_only = article.content_only.replace('?syndication=131181','')
175
+  article.content_only = article.content_only.replace('<iframe width="544" height="306" ','<iframe width="440" height="248" ')
176
+  article.content_only = article.content_only.replace(' frameborder="0" width="534" height="320">',' frameborder="0" width="440" height="264">')
177
+  article.content_only = article.content_only.replace(' onload="lmd.pic(this);" onerror="lmd.pic(this);" class="lazy-retina">','>')
178
+  article.content_only = article.content_only.replace('<figcaption class="legende" data-caption="','<br><em>')
179
+  article.content_only = article.content_only.replace('<figcaption class="legende">','<br><em>')
180
+  article.content_only = article.content_only.replace('</figcaption>','</em>')
181
+  article.content_only = article.content_only.replace('\\n','')
176 182
   #~ article.content_only = article.content_only.replace('','')
177 183
   #~ article.content_only = article.content_only.replace('','')
178 184
   #~ article.content_only = article.content_only.replace('','')
179 185
   #~ article.content_only = article.content_only.replace('','')
180 186
   #~ article.content_only = article.content_only.replace('','')
181
-  #~ article.content_only = html_slimmer(article.content_only)
182
-  article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
183
-  
184
-
187
+  #~ article.content_only = article.content_only.replace('','')
188
+  #~ article.content_only = article.content_only.replace('','')
189
+  #~ article.content_only = article.content_only.replace('','')
190
+  #~ 
191
+  # Cleanup
192
+  regexConjug = re.compile(r'<a class=\"lien_interne conjug\".*?>(.+?)</a>')
193
+  article.content_only = regexConjug.sub('\\1',article.content_only)
185 194
 
195
+  # Diet
196
+  #~ article.content_only = html_slimmer(article.content_only.strip().replace('\n',' ').replace('\t',' ').replace('\r',' '))
197
+  #~ article.content_only = article.content_only.encode("utf-8")
198
+  
186 199
   cpt_prev=cpt-1
187 200
   if cpt_prev < 0:
188 201
     cpt_prev = 0