Showing 37 changed files with 2271 additions and 0 deletions
+213
newsParser/__init__.py
... ...
@@ -0,0 +1,213 @@
1
+#!/usr/bin/env python3
2
+# encoding: UTF-8
3
+__author__ = 'Yanik Cawidrone'
4
+__version__ = '0.1'
5
+
6
+"""
7
+    For more see the file 'LICENSE' for copying permission.
8
+"""
9
+
10
+from .newsParser import newsDNA
11
+from .newsParser import newsFt
12
+from .newsParser import newsLeParisien
13
+from .newsParser import newsLiberation
14
+from .newsParser import newsWaPo
15
+from .newsParser import newsZDNetFr
16
+from .newsParser import newsSCMP
17
+from .newsParser import newsTelerama
18
+from .newsParser import newsCNA
19
+from .newsParser import newsViceCom
20
+from .newsParser import newsNewYorkTimes
21
+from .newsParser import newsMothershipSG
22
+from .newsParser import newsLeMonde
23
+from .newsParser import newsChallengesFr
24
+from .newsParser import newsJDD
25
+from .newsParser import newsMidiLibre
26
+from .newsParser import newsNouvelObs
27
+from .newsParser import newsHuffPost
28
+from .newsParser import newsStraitsTimes
29
+from .newsParser import newsNewYorker
30
+from .newsParser import newsLeFigaro
31
+from .newsParser import newsSudOuest
32
+from .newsParser import newsBBC
33
+from .newsParser import newsTheAtlantic
34
+from .newsParser import newsTheStarMy
35
+from .newsParser import newsNSTMy
36
+from .newsParser import newsLaDepeche
37
+from .newsParser import newsTheGuardian
38
+from .newsParser import newsBloomberg
39
+from .newsParser import newsFranceTVInfo
40
+from .newsParser import newsTheVerge
41
+from .newsParser import newsBondyBlog
42
+from .newsParser import newsFrandroidCom
43
+from .newsParser import newsBuzzfeedCom
44
+from .newsParser import newsYahooCom
45
+from .newsParser import newsBFM
46
+# ~ from .newsParser import newsTodayOnlineSG
47
+
48
def supportedList():
  """Render the names of the available news parsers as an HTML list.

  Every attribute of this module whose name contains "news", but neither
  "__" nor "newsParser", becomes one ``<li>`` entry with the "news" text
  stripped from the name.

  Returns:
    A string holding a complete ``<ul>`` element, one entry per line.
  """
  module = __import__(__name__)
  entries = [
    "<li>" + name.replace("news", "") + "</li>\n"
    for name in dir(module)
    if "news" in name and "__" not in name and "newsParser" not in name
  ]
  return "<ul>\n" + "".join(entries) + "</ul>\n"
58
+  
59
def articleElement(typeElement,content):
  """Extract one piece of article metadata from raw page markup.

  The lookup strategy depends on which site the markup comes from:

  * Buzzfeed pages: "title" and "description" are read from the embedded
    JSON ("headline" / "description" keys); any other element falls back
    to the generic Open Graph lookup.
  * Le Monde pages: only "image" and "title" are read (from ``og:`` meta
    tags closed by ``">``); other elements yield "".
  * New York Times pages: the ``data-rh="true"`` flavour of the ``og:``
    meta tag, closed by ``"/>``.
  * Everything else: the plain or ``data-rh`` ``og:`` meta tag, closed by
    whichever of ``" />``, ``"/>`` or ``">`` comes first.

  Args:
    typeElement: Element to extract, e.g. "title", "image", "description".
    content: Page source as text.

  Returns:
    The extracted value, or "" when no matching tag is found.
  """
  def between(beginMarkers, endMarkers, prefix=""):
    # Text after the first begin marker that is present (tried in order),
    # up to the nearest end marker; "" when either side is missing.
    for begin in beginMarkers:
      start = content.find(begin)
      if start != -1:
        # Skip the marker that actually matched, not always the first one.
        start += len(begin)
        break
    else:
      return ""
    stops = [pos for pos in (content.find(end, start) for end in endMarkers) if pos != -1]
    if not stops:
      return ""
    return prefix + content[start:min(stops)]

  if '"mainEntityOfPage": "https://www.buzzfeed' in content:
    jsonKeys = {"title": '"headline": "', "description": '"description": "'}
    if typeElement in jsonKeys:
      return between([jsonKeys[typeElement]], ['",'])
    # No JSON key is known for other elements (e.g. "image"): use og: tags.
  elif '<meta property="og:url" content="https://www.lemonde.fr/' in content:
    if typeElement == "image":
      # "http" is part of the begin marker, so it is put back in front.
      return between(['<meta property="og:image" content="http'], ['">'], prefix="http")
    if typeElement == "title":
      return between(['<meta property="og:title" content="'], ['">'])
    return ""
  elif '"nytimes.com"' in content:
    return between(['<meta data-rh="true" property="og:'+typeElement+'" content="'], ['"/>'])

  return between(
    [
      '<meta property="og:'+typeElement+'" content="',
      '<meta data-rh="true" property="og:'+typeElement+'" content="',
    ],
    ['" />', '"/>', '">'],
  )
122
+  
123
def articleTitle(content):
  """Return the "title" element of the page markup in *content*.

  Prints a trace line, then delegates to ``articleElement``.
  """
  print("newsParser.articleTitle")
  title = articleElement("title", content)
  return title
126
+  
127
def articleImage(content):
  """Return the "image" element of the page markup in *content*.

  Prints a trace line, then delegates to ``articleElement``.
  """
  print("newsParser.articleImage")
  image = articleElement("image", content)
  return image
130
+  
131
def articleDescription(content):
  """Return the "description" element of the page markup in *content*.

  Prints a trace line, then delegates to ``articleElement``.
  """
  print("newsParser.articleDescription")
  description = articleElement("description", content)
  return description
134
+
135
def getArticle(url):
  """Return the simplified HTML for the article at *url*.

  The URL is matched against an ordered table of site fragments; the first
  entry with a fragment contained in the URL decides which parser module's
  ``article(url)`` is called.

  Args:
    url: Article address, or None.

  Returns:
    The parser's HTML output; "" when *url* is None; otherwise an
    "Unsupported News" notice linking to the original URL followed by the
    list of supported sites.
  """
  import html  # local import: only used to escape the URL echoed back below

  if url is None:
    return ""

  # NOTE(review): matching is by plain substring anywhere in the URL, so
  # e.g. "ft.com" also matches "microsoft.com". Order matters: first hit wins.
  dispatch = (
    (("ft.com",), newsFt),
    (("dna.fr",), newsDNA),
    (("washingtonpost.com",), newsWaPo),
    (("leparisien.fr",), newsLeParisien),
    (("liberation.fr",), newsLiberation),
    (("zdnet.fr",), newsZDNetFr),
    (("scmp.com",), newsSCMP),
    (("telerama.fr",), newsTelerama),
    (("channelnewsasia.com",), newsCNA),
    (("vice.com",), newsViceCom),
    (("nytimes.com",), newsNewYorkTimes),
    (("mothership.sg",), newsMothershipSG),
    (("lemonde.fr",), newsLeMonde),
    (("lejdd.fr",), newsJDD),
    (("nouvelobs.com",), newsNouvelObs),
    (("huffingtonpost.", "huffpost.com"), newsHuffPost),
    (("straitstimes.com",), newsStraitsTimes),
    (("newyorker.com",), newsNewYorker),
    (("lefigaro.fr",), newsLeFigaro),
    (("sudouest.fr",), newsSudOuest),
    (("bbc.com",), newsBBC),
    (("theatlantic.com",), newsTheAtlantic),
    (("thestar.com.my",), newsTheStarMy),
    (("challenges.fr",), newsChallengesFr),
    (("depeche.fr",), newsLaDepeche),
    (("guardian.com", "guardian.co.uk"), newsTheGuardian),
    (("bloomberg.com",), newsBloomberg),
    (("francetvinfo.fr",), newsFranceTVInfo),
    (("theverge.com",), newsTheVerge),
    (("bondyblog.fr",), newsBondyBlog),
    (("frandroid.com",), newsFrandroidCom),
    (("buzzfeed.com", "buzzfeednews.com"), newsBuzzfeedCom),
    (("news.yahoo.com", "afp.com"), newsYahooCom),
    (("bfmtv.com",), newsBFM),
    # Imported (and listed by supportedList) but previously never dispatched.
    # NOTE(review): assumes both modules expose article(url) like the others.
    (("midilibre.fr",), newsMidiLibre),
    (("nst.com.my",), newsNSTMy),
  )
  for fragments, parser in dispatch:
    if any(fragment in url for fragment in fragments):
      return parser.article(url)

  # The URL comes from the caller and lands inside an HTML attribute:
  # escape it so quotes or angle brackets cannot break out of the href.
  safeUrl = html.escape(url, quote=True)
  data_page = "<p>Unsupported News, click to open <a href=\""+safeUrl+"\" target=\"new\">original link</a></p>\n"
  data_page += "<p>Supported News:"
  data_page += supportedList()
  data_page += "</p>\n"
  return data_page
+53
newsParser/newsParser/newsBBC.py
... ...
@@ -0,0 +1,53 @@
1
+from userio import *
2
+import requests
3
+import re
4
+
5
def article(url):
  """Fetch a BBC article and return it as a simplified HTML fragment.

  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image) followed by the article markup wrapped in ``<article>``.
  Headings are demoted one level, sharing widgets and ``<noscript>``
  wrappers are removed, and site-relative links are pointed at www.bbc.com.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the page has no ``<article `` opening tag or none of the
      known end-of-article markers.
  """
  # This module does not import the package-level helpers at file level.
  import newsParser

  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  indexBegin = content.index("<article ")
  # End markers in order of preference; the last one is mandatory.
  for endMarker in ('<div class="article-full__footer">', '<section data-component="tag-list"'):
    indexEnd = content.find(endMarker)
    if indexEnd != -1:
      break
  else:
    indexEnd = content.index("</article>")

  article_only = content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)
  article_only = re.sub(r"<div id=\"share-tools-panel\" (.*?)>Share page</div>", '', article_only)
  article_only = re.sub(r"<a href=\"(.*?)\" class=\"(.*?)\">About sharing</a>", '', article_only)
  article_only = article_only.replace("><", ">\n<")
  article_only = re.sub(r"<span class=\"(.*?)-VisuallyHidden (.*?)\">image copyright</span>", '', article_only)
  article_only = re.sub(r"<span class=\"(.*?)-VisuallyHidden (.*?)\">image caption</span>", '', article_only)
  # The 4th positional argument of re.sub is `count`, not `flags`; passing
  # re.MULTILINE there limited these substitutions to 8 occurrences.
  article_only = article_only.replace("<noscript>", "")
  article_only = article_only.replace("</noscript>", "")
  article_only = re.sub(r"<div class=\"(.*?)-TagShareWrapper (.*?)\">", '<div style="display: none;">', article_only)

  article_only = article_only.replace('href="/', 'href="//www.bbc.com/')
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+45
newsParser/newsParser/newsBFM.py
... ...
@@ -0,0 +1,45 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+ 
6
def article(url):
  """Fetch a BFM TV article and return it as a simplified HTML fragment.

  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image) followed by the article body wrapped in ``<article>``, with
  headings demoted one level and site-relative links pointed at
  www.bfmtv.com.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the body start or end marker is absent from the page.
  """
  say("Article: "+url)
  # The former "dna.fr/" -> "dna.fr/amp/" rewrite was a leftover from the
  # DNA parser and is dropped here.
  r = requests.get(url, allow_redirects=True)
  content = r.text
  articleStrTitle = newsParser.articleTitle(content)
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleCstBegin = '<div class="content_body">'
  articleCstEnd   = '<div class="content_body" id="content_body_bottom">'

  # A title/image/description preamble used to be built here and then
  # discarded by a reset of pageContent; only the meta block was ever emitted.
  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  indexBegin = content.index(articleCstBegin)
  indexEnd   = content.index(articleCstEnd)
  article_only = content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ('href="/', 'href="//www.bfmtv.com/'),
  ):
    article_only = article_only.replace(old, new)

  pageContent += "<article>"+article_only+"</article>"
  pageContent = pageContent.replace("><", ">\n<")
  return pageContent
+53
newsParser/newsParser/newsBloomberg.py
... ...
@@ -0,0 +1,53 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Fetch a Bloomberg article and return it as a simplified HTML fragment.

  The page is requested with a desktop browser User-Agent. The result is a
  block of Open Graph ``<meta>`` tags followed by an ``<article>`` holding
  the title, description, lead image and the extracted article body, with
  headings demoted one level and site-relative links pointed at
  www.bloomberg.com.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
  content = r.text

  # Start of the body: preferred marker, then fallback marker, then offset 0.
  indexBegin = content.find('<div class="article-content">')
  if indexBegin == -1:
    indexBegin = content.find('<time class="article-timestamp"')
  if indexBegin == -1:
    indexBegin = 0
  # A missing end marker yields offset 0, i.e. an empty body slice below.
  indexEnd = max(content.find('<div class="bottom-left-rail-touts-spacer">'), 0)

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  article_only = ""
  article_only += "<h2>"+articleStrTitle+"</h2>\n"
  article_only += "<em>"+articleStrDescription+"</em>\n"
  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
  article_only += content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ('<div class="ac-w-ph__dsc">Advertisement</div>', ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ("<p>Advertisement</p>", ""),
  ):
    article_only = article_only.replace(old, new)
  # Raw replacement string: "\g<2>" is a group reference, not a str escape.
  article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">', article_only)
  article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
  article_only = article_only.replace("><", ">\n<")

  # Relative links belong to Bloomberg (they were rewritten to nytimes.com).
  article_only = article_only.replace('href="/', 'href="//www.bloomberg.com/')
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+45
newsParser/newsParser/newsBondyBlog.py
... ...
@@ -0,0 +1,45 @@
1
+from userio import *
2
+import requests
3
+import re
4
+
5
def article(url):
  """Fetch a Bondy Blog article and return it as a simplified HTML fragment.

  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image) followed by the article markup wrapped in ``<article>``, with
  headings demoted one level and site-relative links pointed at
  www.bondyblog.fr.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the page header marker or every known end-of-article
      marker is absent from the page.
  """
  # This module does not import the package-level helpers at file level.
  import newsParser

  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  indexBegin = content.index('<section class="pageHeader">')
  # End markers in order of preference; the last one is mandatory.
  for endMarker in ('<section class="pageComponents">', '<section subscriptions-section="content-not-granted">'):
    indexEnd = content.find(endMarker)
    if indexEnd != -1:
      break
  else:
    indexEnd = content.index("</article>")

  article_only = content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ("><", ">\n<"),
    ('href="/', 'href="//www.bondyblog.fr/'),
  ):
    article_only = article_only.replace(old, new)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+68
newsParser/newsParser/newsBuzzfeedCom.py
... ...
@@ -0,0 +1,68 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
+  
7
def article(url):
  """Fetch a Buzzfeed / Buzzfeed News article as a simplified HTML fragment.

  Unless the URL already contains "/amphtml", it is rewritten to the AMP
  variant; the "?origin=web-hf" query suffix is stripped. The result is a
  block of Open Graph ``<meta>`` tags followed by an ``<article>`` holding
  the title, description and the cleaned-up article markup.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the page has no ``<article `` opening tag or none of the
      known end-of-article markers.
  """
  say("Article: "+url)
  if not "/amphtml" in url:
    say("Trying AMP")
    url = url.replace("buzzfeednews.com/article","buzzfeednews.com/amphtml")
    url = url.replace("buzzfeed.com/","buzzfeed.com/amphtml/")
  # str.replace returns a new string; the result was previously thrown away.
  url = url.replace("?origin=web-hf","")

  r = requests.get(url, allow_redirects=True)
  content = r.text
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrImageUrl = newsParser.articleImage(content)

  indexBegin = content.index("<article ")
  # End markers in order of preference; the last one is mandatory.
  for endMarker in ('<div class="subbuzz subbuzz-bfp">', "</article>"):
    indexEnd = content.find(endMarker)
    if indexEnd != -1:
      break
  else:
    indexEnd = content.index('<div class="shares shares--inline')

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  article_only = ""
  article_only += "<h1>"+articleStrTitle+"</h1>\n"
  article_only += "<em>"+articleStrDescription+"</em>\n"
  article_only += content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Neutralise the sharing widgets and their SVG icons.
  article_only = re.sub(r"<amp-social-share (.*?)>", "<amp-social-share>", article_only)
  article_only = article_only.replace('<span class="icon icon--primary flex">', "<span>")
  article_only = re.sub(r"<title>(.*?)</title>", "", article_only)
  article_only = re.sub(r"<use xlink:href=\"(.*?)\">", "<use>", article_only)
  article_only = re.sub(r"<svg class=\"svg-(.*?)\">", "<svg height=\"1px\">", article_only)
  article_only = article_only.replace("Share on Facebook", "")
  article_only = article_only.replace("Share on Pinterest", "")
  article_only = article_only.replace("><", ">\n<")

  if "buzzfeed.com" in url:
    article_only = article_only.replace('href="/', 'href="//www.buzzfeed.com/')
  elif "buzzfeednews.com" in url:
    article_only = article_only.replace('href="/', 'href="//www.buzzfeednews.com/')
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+59
newsParser/newsParser/newsCNA.py
... ...
@@ -0,0 +1,59 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the value of the page's ``og:image`` meta tag.

  The value runs from just after ``<meta property="og:image" content="``
  to the next ``">``.

  Raises:
    ValueError: if either marker is missing from *content*.
  """
  opening = '<meta property="og:image" content="'
  tagStart = content.index(opening)
  tagStop = content.index('">', tagStart)
  valueStart = tagStart + len(opening)
  return content[valueStart:tagStop]
13
+  
14
def article(url):
  """Fetch a Channel News Asia article as a simplified HTML fragment.

  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image) followed by the article markup wrapped in ``<article>``. The
  lead image is inserted before the author link, advertisement / sharing /
  bookmark widgets are removed or hidden, headings are demoted one level,
  and site-relative links are pointed at www.channelnewsasia.com.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the ``<article`` start or the article footer marker is
      absent from the page.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text
  indexBegin = content.index("<article")
  indexEnd   = content.index('<footer class="article__footer">')
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  article_only = content[indexBegin:indexEnd]
  authorLink = '<div class="article__author-link">'
  # Headings are demoted exactly once (h2 before h1); a second, duplicated
  # pass used to push former h1 headings down to h3.
  # Plain str.replace is used for fixed text so the image URL is never
  # interpreted as a regex replacement template.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    (authorLink, '<img src="'+articleStrImageUrl+'">'+authorLink),
    ('<span class="advertisement__title">Advertisement</span>', ""),
    ('class="picture__image lazyload"', ""),
  ):
    article_only = article_only.replace(old, new)
  # Non-greedy: remove each addthis link on its own instead of everything
  # up to the last "</a>" on the line.
  article_only = re.sub(r"<a class=\"addthis_button(.*?)</a>", '', article_only)
  article_only = article_only.replace(
    '<div class="c-sharing--default is-article-top-position"',
    '<div class="c-sharing--default is-article-top-position" style="display:none"',
  )
  article_only = article_only.replace('<h3 class="save-for-later__title">Bookmark</h3>', "")
  article_only = article_only.replace("><", ">\n<")

  article_only = article_only.replace('href="/', 'href="//www.channelnewsasia.com/')
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+55
newsParser/newsParser/newsChallengesFr.py
... ...
@@ -0,0 +1,55 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleAbonnes(content):
  """Tell whether the page is flagged as subscriber-only.

  Args:
    content: Page source as text.

  Returns:
    "Abonn&eacute;s" when the text "réservé aux abonnés" occurs in
    *content*, otherwise "".
  """
  # A membership test replaces the index()/bare-except pair, which also
  # swallowed unrelated errors.
  if "réservé aux abonnés" in content:
    return "Abonn&eacute;s"
  return ""
15
+  
16
def article(url):
  """Fetch a Challenges.fr article as a simplified HTML fragment.

  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image), the title and lead image, the article markup (up to the
  paywall widget when present, otherwise up to ``</article>``) and a final
  paragraph holding the subscriber-only label, if any. Headings are demoted
  one level and site-relative links are pointed at www.challenges.fr.

  Args:
    url: Address of the article.

  Returns:
    The HTML fragment as a string.

  Raises:
    ValueError: if the page has no ``<article `` opening tag or neither of
      the end-of-article markers.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text
  # Metadata comes from the package-level helpers; this module defines no
  # articleImage/articleTitle of its own, and the description is needed for
  # the og:description tag below.
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrType = articleAbonnes(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'
  pageContent += "<h2>"+articleStrTitle+"</h2>\n"
  pageContent += "<img src=\""+articleStrImageUrl+"\">\n"

  indexBegin = content.index("<article ")
  # Stop at the paywall widget when there is one, else at the closing tag.
  indexEnd = content.find('<div id="poool-widget">')
  if indexEnd == -1:
    indexEnd = content.index("</article>")

  article_only = content[indexBegin:indexEnd]
  # h2 must be demoted before h1, otherwise former h1 headings end up as h3.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    # Relative links belong to Challenges (was "wwww.liberation.fr").
    ('href="/', 'href="//www.challenges.fr/'),
  ):
    article_only = article_only.replace(old, new)
  pageContent += article_only
  pageContent += "<p>"+articleStrType+"</p>"
  return pageContent
+38
newsParser/newsParser/newsDNA.py
... ...
@@ -0,0 +1,38 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Fetch the AMP version of a DNA article as a simplified HTML fragment.

  The URL is rewritten from "dna.fr/" to "dna.fr/amp/" before the request.
  The result is a block of Open Graph ``<meta>`` tags (title, description,
  url, image) followed by the access-controlled content section wrapped in
  ``<article>``, with headings demoted one level and site-relative links
  pointed at dna.fr.

  Raises:
    ValueError: if the content section start or end marker is absent.
  """
  say("Article: "+url)
  url = url.replace("dna.fr/","dna.fr/amp/")
  content = requests.get(url, allow_redirects=True).text

  image = newsParser.articleImage(content)
  title = newsParser.articleTitle(content)
  description = newsParser.articleDescription(content)

  header = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', title, '">\n',
    '<meta property="og:description" content="', description, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', image, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  sectionStart = content.index('<section poool-access-content amp-access="access" amp-access-hide>')
  sectionStop = content.index('<section amp-access="NOT error AND NOT access" id="poool">')
  body = content[sectionStart:sectionStop]
  # Order matters: h2 is demoted before h1 so each heading moves one level.
  rewrites = (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ('href="/', 'href="//dna.fr/'),
  )
  for old, new in rewrites:
    body = body.replace(old, new)
  return header + "<article>" + body + "</article>"
+43
newsParser/newsParser/newsFranceTVInfo.py
... ...
@@ -0,0 +1,43 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Fetch the AMP version of a France TV Info article as simplified HTML.

  The URL is rewritten from ".html" to ".amp" before the request. The
  result is a block of Open Graph ``<meta>`` tags (title, description, url,
  image) followed by the article markup wrapped in ``<article>``, with
  headings demoted one level and site-relative ``href`` / ``src`` values
  pointed at www.francetvinfo.fr.

  Raises:
    ValueError: if the page has no ``<article `` opening tag or neither of
      the end-of-article markers.
  """
  say("Article: "+url)
  url = url.replace(".html",".amp")
  content = requests.get(url, allow_redirects=True).text

  image = newsParser.articleImage(content)
  title = newsParser.articleTitle(content)
  description = newsParser.articleDescription(content)

  header = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', title, '">\n',
    '<meta property="og:description" content="', description, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', image, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  bodyStart = content.index("<article ")
  # Prefer the social zone as the cut-off; otherwise the closing tag.
  bodyStop = content.find('<section class="social-zone">')
  if bodyStop == -1:
    bodyStop = content.index("</article")
  body = content[bodyStart:bodyStop]
  # Order matters: h2 is demoted before h1 so each heading moves one level.
  rewrites = (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ('href="/', 'href="//www.francetvinfo.fr/'),
    ('src="/', 'src="//www.francetvinfo.fr/'),
  )
  for old, new in rewrites:
    body = body.replace(old, new)
  return header + "<article>" + body + "</article>"
+48
newsParser/newsParser/newsFrandroidCom.py
... ...
@@ -0,0 +1,48 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+  
6
def article(url):
  """Download a frandroid.com article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image) followed by the article title as a heading and the extracted
  article body. Unlike some sibling parsers, the body is not wrapped in an
  <article> element.

  Raises:
    ValueError: if the article start marker, or every end marker, is missing
      from the downloaded page.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text

  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrImageUrl = newsParser.articleImage(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index('<div class="article-content')
  # End markers in order of preference, searched only after the start so an
  # earlier occurrence cannot produce an empty slice.
  for end_marker in (' <p class="title">', '<div class="article-footer', "</article>"):
    stop = content.find(end_marker, start)
    if stop != -1:
      break
  else:
    raise ValueError("no article end marker found")

  article_only = "<h2>"+articleStrTitle+"</h2>\n" + content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice). The prepended title is demoted too.
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
    ("><", ">\n<"),  # one tag per line for readability
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.frandroid.com/', article_only)
  pageContent += article_only
  return pageContent
+36
newsParser/newsParser/newsFt.py
... ...
@@ -0,0 +1,36 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download the AMP version of an FT article and return simplified HTML.

  The "www." host prefix of ``url`` is replaced by "amp." before the page is
  fetched. The returned string holds Open Graph <meta> tags (title,
  description, URL, image) followed by the extracted article markup wrapped
  in <article>...</article>.

  Raises:
    ValueError: if the "<article" start marker or the closing "</article>"
      is missing from the downloaded page.
  """
  say("Article: "+url)
  # Only rewrite the host prefix: a blanket replace of "www" would also
  # mangle any "www" that happens to occur in the path or query string.
  url = url.replace("://www.", "://amp.", 1)
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article")
  # Look for the closing tag after the opening one only.
  stop = content.index("</article>", start)

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+46
newsParser/newsParser/newsHuffPost.py
... ...
@@ -0,0 +1,46 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a HuffPost article and return a simplified HTML fragment.

  The page is requested with a desktop-browser User-Agent. The returned
  string holds Open Graph <meta> tags (title, description, URL, image)
  followed by the extracted article markup, with share widgets neutralised,
  wrapped in <article>...</article>.

  Raises:
    ValueError: if the "<article" start marker, or every end marker, is
      missing from the downloaded page.
  """
  say("Article: "+url)
  headers = {
    'Accept-Encoding': 'deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
  }
  content = requests.get(url, allow_redirects=True, headers=headers).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article")
  # End markers in order of preference, searched only after the start so an
  # earlier occurrence cannot produce an empty slice.
  for end_marker in ('<div class="related-entries', "</article>"):
    stop = content.find(end_marker, start)
    if stop != -1:
      break
  else:
    raise ValueError("no article end marker found")

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Neutralise share links and hide share bars. The order matters: the
  # generic "share-bar" rule also rewrites the div produced just before it.
  article_only = re.sub(r'<a class="share(.*?)" data-social-name="(.*?)" href="(.*?)" target="_blank">', '<a href="">', article_only)
  article_only = re.sub(r'<li class="(.*?) share-bar__item">', '<li>', article_only)
  article_only = article_only.replace(
    '<div class="share-bar share-bar--sticky yr-share">',
    '<div class="share-bar share-bar--sticky yr-share" style="display:none;">')
  article_only = re.sub(r'<div class="(.*?) share-bar(.*?)>', '<div style="display:none;">', article_only)
  article_only = article_only.replace('<div class="yr-share">', '<div style="display:none;">')
  article_only = article_only.replace("><", ">\n<")  # one tag per line

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+56
newsParser/newsParser/newsJDD.py
... ...
@@ -0,0 +1,56 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
+
7
def articleAbonnes(content):
  """Return the "Abonn&eacute;s" (subscribers) label for flagged pages.

  Args:
    content: page HTML as a string.

  Returns:
    "Abonn&eacute;s" when ``content`` contains the text "ABONNÉS",
    otherwise the empty string.
  """
  # A plain membership test replaces the former index()/bare-except dance,
  # which also swallowed unrelated errors.
  return "Abonn&eacute;s" if "ABONNÉS" in content else ""
16
+  
17
def article(url):
  """Download a lejdd.fr article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image), the extracted article markup wrapped in <article>...</article>,
  and a trailing <p> carrying the label returned by ``articleAbonnes``
  (empty when the page is not flagged).

  Raises:
    ValueError: if the "<article " start marker or the closing "</article>"
      is missing from the downloaded page.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text
  articleStrType = articleAbonnes(content)

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article ")
  # Look for the closing tag after the opening one only.
  stop = content.index("</article>", start)

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Blank out the Facebook/Twitter share links.
  article_only = re.sub(r'<a href=(.*?) id="fb_socialPageLink" class="icon-Facebook">', '<a href="">', article_only)
  article_only = re.sub(r'<a href=(.*?) id="tw_socialPageLink" class="icon-Twitter">', '<a href="">', article_only)
  for old, new in (
    ('target="_self"', 'target="new"'),
    ('<div class="nota col-md-4">Partager sur :</div>', ''),  # "share on" caption
    ('<span class="hide">"</span>', ''),
    ("><", ">\n<"),  # one tag per line
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.lejdd.fr/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  pageContent += "<p>"+articleStrType+"</p>"
  return pageContent
+49
newsParser/newsParser/newsLaDepeche.py
... ...
@@ -0,0 +1,49 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a ladepeche.fr article (AMP version) and return simplified HTML.

  Unless ``url`` already contains "/amp", the host "www.ladepeche.fr" is
  rewritten to "www.ladepeche.fr/amp" before the page is fetched. The
  returned string holds Open Graph <meta> tags (title, description, URL,
  image) followed by the extracted article markup wrapped in
  <article>...</article>.

  Raises:
    ValueError: if the "<article " start marker, or every end marker, is
      missing from the downloaded page.
  """
  say("Article: "+url)
  if "/amp" not in url:
    say("Trying AMP")
    url = url.replace("www.ladepeche.fr", "www.ladepeche.fr/amp")
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article ")
  # End markers in order of preference, searched only after the start so an
  # earlier occurrence cannot produce an empty slice.
  end_markers = (
    '<div class="article-full__footer">',
    '<section subscriptions-section="content-not-granted">',
    "</article>",
  )
  for end_marker in end_markers:
    stop = content.find(end_marker, start)
    if stop != -1:
      break
  else:
    raise ValueError("no article end marker found")

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
    ("><", ">\n<"),  # one tag per line
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.ladepeche.fr/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+47
newsParser/newsParser/newsLeFigaro.py
... ...
@@ -0,0 +1,47 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a Le Figaro article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image) followed by the extracted article markup, with share widgets
  neutralised, wrapped in <article>...</article>.

  Raises:
    ValueError: if the "<article" start marker, or every end marker, is
      missing from the downloaded page.
  """
  say("Article: "+url)
  # NOTE: a request variant sending a browser-like User-Agent header was
  # previously tried here and is currently disabled.
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article")
  # End markers in order of preference, searched only after the start so an
  # earlier occurrence cannot produce an empty slice.
  for end_marker in ('<div class="related-entries', "</article>"):
    stop = content.find(end_marker, start)
    if stop != -1:
      break
  else:
    raise ValueError("no article end marker found")

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Neutralise share links and hide share bars. The order matters: the
  # generic "share-bar" rule also rewrites the div produced just before it.
  article_only = re.sub(r'<a class="share(.*?)" data-social-name="(.*?)" href="(.*?)" target="_blank">', '<a href="">', article_only)
  article_only = re.sub(r'<li class="(.*?) share-bar__item">', '<li>', article_only)
  article_only = article_only.replace(
    '<div class="share-bar share-bar--sticky yr-share">',
    '<div class="share-bar share-bar--sticky yr-share" style="display:none;">')
  article_only = re.sub(r'<div class="(.*?) share-bar(.*?)>', '<div style="display:none;">', article_only)
  article_only = article_only.replace('<div class="yr-share">', '<div style="display:none;">')
  article_only = article_only.replace("><", ">\n<")  # one tag per line

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+81
newsParser/newsParser/newsLeMonde.py
... ...
@@ -0,0 +1,81 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the URL from the page's og:image <meta> tag.

  Only tags whose content starts with "http" and that are closed by '">'
  are recognised. Raises ValueError when no such tag is found.
  """
  opening = '<meta property="og:image" content="http'
  tag_at = content.index(opening)
  # The closing '">' is looked up from the tag position onwards.
  close_at = content.index('">', tag_at)
  url_tail = content[tag_at + len(opening):close_at]
  return "http" + url_tail
13
+  
14
def articleTitle(content):
  """Return the text of the page's og:title <meta> tag.

  The tag must be closed by '">'. Raises ValueError when the tag or its
  closing sequence is not found.
  """
  opening = '<meta property="og:title" content="'
  tag_at = content.index(opening)
  # The closing '">' is looked up from the tag position onwards.
  close_at = content.index('">', tag_at)
  return content[tag_at + len(opening):close_at]
21
+  
22
def articleAbonnes(content):
  """Return the "Abonn&eacute;s" (subscribers) label for restricted pages.

  Args:
    content: page HTML as a string.

  Returns:
    "Abonn&eacute;s" when ``content`` contains the CSS class name
    "article__content--restricted", otherwise the empty string.
  """
  # A plain membership test replaces the former index()/bare-except dance,
  # which also swallowed unrelated errors.
  return "Abonn&eacute;s" if "article__content--restricted" in content else ""
32
+  
33
def article(url):
  """Download a lemonde.fr article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image), then an <article> element containing the title, the lead image
  and the extracted article body, and finally a <p> carrying the label
  returned by ``articleAbonnes`` (empty when the page is not restricted).

  Raises:
    ValueError: if neither start marker, or no closing "</article>", is
      found in the downloaded page.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text

  # Prefer the content section; fall back to the whole <article> element.
  start = content.find('<section class="article__content')
  if start == -1:
    start = content.index("<article ")
  # Look for the closing tag after the start only.
  stop = content.index("</article>", start)

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrType = articleAbonnes(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  article_only += content[start:stop]
  article_only = article_only.replace("<amp-img", "<img").replace("</amp-img>", "")
  # Drop the page's own media figures; the lead image was added above.
  article_only = re.sub(r'<figure class="article__media">(.*?)</figure>', '', article_only)
  # Lazy-loaded images: use the 1x candidate of data-srcset as the real src.
  article_only = re.sub(
    r'<img src="data(.*?)" data-srcset=" (.*?) 1x,(.*?)"(.*?)>',
    r'<img src="\g<2>">',
    article_only)
  # Headings move one level down (h2 first so a former h1 is not demoted
  # twice); this also demotes the title heading prepended above.
  for old, new in (
    ("</p>", "</p>\n"),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
    ("><", ">\n<"),  # one tag per line
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.lemonde.fr/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  pageContent += "<p>"+articleStrType+"</p>"
  return pageContent
+41
newsParser/newsParser/newsLeParisien.py
... ...
@@ -0,0 +1,41 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a leparisien.fr article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image) followed by the extracted article markup wrapped in
  <article>...</article>.

  Raises:
    ValueError: if the "<article " start marker or the
      '<div class="article-spacing">' end marker is missing from the page.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article ")
  # The article is cut at the spacing div rather than at "</article>"; the
  # search starts at `start` so an earlier div cannot yield an empty slice.
  stop = content.index('<div class="article-spacing">', start)

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.leparisien.fr/', article_only)
  article_only = re.sub(r'src="/(?!/)', 'src="//www.leparisien.fr/', article_only)
  article_only = article_only.replace("><", ">\n<")  # one tag per line
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+66
newsParser/newsParser/newsLiberation.py
... ...
@@ -0,0 +1,66 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the value of the page's og:image <meta> tag.

  The tag must be self-closed with '"/>'. Raises ValueError when the tag or
  its closing sequence is not found.
  """
  opening = '<meta property="og:image" content="'
  tag_at = content.index(opening)
  # The closing '"/>' is looked up from the tag position onwards.
  close_at = content.index('"/>', tag_at)
  return content[tag_at + len(opening):close_at]
13
+  
14
def articleTitle(content):
  """Return the value of the page's og:title <meta> tag.

  The tag must be self-closed with '"/>'. Raises ValueError when the tag or
  its closing sequence is not found.
  """
  opening = '<meta property="og:title" content="'
  tag_at = content.index(opening)
  # The closing '"/>' is looked up from the tag position onwards.
  close_at = content.index('"/>', tag_at)
  return content[tag_at + len(opening):close_at]
21
+  
22
def articleAbonnes(content):
  """Return the "Abonn&eacute;s" (subscribers) label for reserved pages.

  Args:
    content: page HTML as a string.

  Returns:
    "Abonn&eacute;s" when ``content`` contains the text
    "réservé aux abonnés", otherwise the empty string.
  """
  # A plain membership test replaces the former index()/bare-except dance,
  # which also swallowed unrelated errors.
  return "Abonn&eacute;s" if "réservé aux abonnés" in content else ""
31
+  
32
def article(url):
  """Download a liberation.fr article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image), the title as an <h2>, the lead image, the extracted article
  markup (not wrapped in an extra <article> element), and finally a <p>
  carrying the label returned by ``articleAbonnes`` (empty when the page is
  not flagged).

  Raises:
    ValueError: if the "<article " start marker or the closing "</article>"
      is missing from the downloaded page.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrType = articleAbonnes(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
    # Title and image sit outside article_only, so the heading demotion
    # below does not touch this <h2>.
    '<h2>', articleStrTitle, '</h2>\n',
    '<img src="', articleStrImageUrl, '">\n',
  ])

  start = content.index("<article ")
  # Look for the closing tag after the opening one only.
  stop = content.index("</article>", start)

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute (host previously misspelled "wwww");
  # protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.liberation.fr/', article_only)
  pageContent += article_only
  pageContent += "<p>"+articleStrType+"</p>"
  return pageContent
+48
newsParser/newsParser/newsMidiLibre.py
... ...
@@ -0,0 +1,48 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a midilibre.fr article (AMP version) and return simplified HTML.

  The host "www.midilibre.fr" is rewritten to "www.midilibre.fr/amp" before
  the page is fetched, unless ``url`` already points there. The returned
  string holds Open Graph <meta> tags (title, description, URL, image)
  followed by the extracted article markup wrapped in
  <article>...</article>.

  Raises:
    ValueError: if the "<article " start marker, or every end marker, is
      missing from the downloaded page.
  """
  say("Article: "+url)
  # Guard against turning an AMP URL into ".../amp/amp/...".
  if "www.midilibre.fr/amp" not in url:
    url = url.replace("www.midilibre.fr", "www.midilibre.fr/amp")
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index("<article ")
  # End markers in order of preference, searched only after the start so an
  # earlier occurrence cannot produce an empty slice.
  end_markers = (
    '<div class="article-full__footer">',
    '<section subscriptions-section="content-not-granted">',
    "</article>",
  )
  for end_marker in end_markers:
    stop = content.find(end_marker, start)
    if stop != -1:
      break
  else:
    raise ValueError("no article end marker found")

  article_only = content[start:stop]
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice).
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
    ("><", ">\n<"),  # one tag per line
  ):
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute; protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.midilibre.fr/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+57
newsParser/newsParser/newsMothershipSG.py
... ...
@@ -0,0 +1,57 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a mothership.sg article and return a simplified HTML fragment.

  The returned string holds Open Graph <meta> tags (title, description, URL,
  image) followed by an <article> element containing the title, the lead
  image and the extracted article body with ads and share widgets removed
  or hidden.

  Raises:
    ValueError: if the '<div class="main-item" ' start marker or the
      '<div class="social-share bottom">' end marker is missing.
  """
  say("Article: "+url)
  content = requests.get(url, allow_redirects=True).text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  start = content.index('<div class="main-item" ')
  stop = content.index('<div class="social-share bottom">', start)

  # h2 is demoted before h1 so a former h1 is not demoted twice per pass.
  heading_swaps = (
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  )

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  article_only += content[start:stop]
  article_only = article_only.replace("<amp-img", "<img").replace("</amp-img>", "")
  for old, new in heading_swaps:
    article_only = article_only.replace(old, new)

  # Put the lead image in front of the author link. A plain string replace
  # is used so the image URL is never interpreted as a regex template.
  author_div = '<div class="article__author-link">'
  article_only = article_only.replace(author_div, '<img src="'+articleStrImageUrl+'">'+author_div)
  article_only = article_only.replace('<span class="advertisement__title">Advertisement</span>', '')

  article_only = article_only.replace('class="picture__image lazyload"', '')
  # Greedy on purpose (as before): removes from the first addthis button to
  # the last "</a>" on the same line.
  article_only = re.sub(r'<a class="addthis_button(.*)</a>', '', article_only)
  article_only = article_only.replace(
    '<div class="c-sharing--default is-article-top-position"',
    '<div class="c-sharing--default is-article-top-position" style="display:none"')
  article_only = article_only.replace('<h3 class="save-for-later__title">Bookmark</h3>', '')
  # NOTE(review): headings are demoted a second time here, so original <h1>
  # elements end up as <h3>. Kept as-is; confirm whether this is intended.
  for old, new in heading_swaps:
    article_only = article_only.replace(old, new)
  article_only = article_only.replace("><", ">\n<")  # one tag per line

  # Make root-relative links absolute. All three rules now emit the same
  # "//mothership.sg/" prefix (two of them used "///"), and
  # protocol-relative URLs are left alone.
  article_only = re.sub(r'href="/(?!/)', 'href="//mothership.sg/', article_only)
  article_only = re.sub(r'src="/(?!/)', 'src="//mothership.sg/', article_only)
  article_only = re.sub(r"src='/(?!/)", "src='//mothership.sg/", article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+58
newsParser/newsParser/newsNSTMy.py
... ...
@@ -0,0 +1,58 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import json
5
+import newsParser
6
+
7
def article(url):
  """Download an NST article and return a simplified HTML fragment.

  The page embeds the article as HTML-escaped JSON in the ``:article``
  attribute of an <article-component> element. That JSON is decoded and its
  'title', 'body' and first 'field_article_images' entry are used to build
  the result: Open Graph <meta> tags followed by an <article> element with
  the title, lead image, optional caption and body.

  Raises:
    ValueError: if the component markers are missing from the page (or the
      embedded JSON is malformed, via json.loads).
    KeyError, IndexError: if the JSON lacks the expected fields or has no
      image entry.
  """
  say("Article: "+url)
  headers = {
    'Accept-Encoding': 'deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
  }
  content = requests.get(url, allow_redirects=True, headers=headers).text

  opening = '<article-component :article="'
  start = content.index(opening) + len(opening)
  # The attribute ends where the next one (":nid") begins; search only
  # after the opening marker.
  stop = content.index('" :nid=', start)

  article_json = content[start:stop]
  # Undo the attribute escaping so the payload parses as JSON.
  for escaped, plain in (("&quot;", '"'), ("\\/", "/"), ("&lt;", "<"), ("&gt;", ">")):
    article_json = article_json.replace(escaped, plain)
  jsonArticle = json.loads(article_json)

  lead_image = jsonArticle['field_article_images'][0]
  articleStrImageUrl = lead_image['url']
  articleStrImageCaption = lead_image['caption']
  articleStrTitle = jsonArticle['title']

  articleStrDescription = newsParser.articleDescription(content)

  pageContent = "".join([
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="', articleStrTitle, '">\n',
    '<meta property="og:description" content="', articleStrDescription, '">\n',
    '<meta property="og:url" content="', url, '">\n',
    '<meta property="og:image" content="', articleStrImageUrl, '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ])

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  if articleStrImageCaption is not None:
    article_only += "<em>"+articleStrImageCaption+"</em>\n"
  article_only += jsonArticle['body']
  # AMP images become plain <img>; headings move one level down (h2 first so
  # a former h1 is not demoted twice). The prepended title is demoted too.
  for old, new in (
    ("<amp-img", "<img"), ("</amp-img>", ""),
    ("<h2", "<h3"), ("</h2>", "</h3>"),
    ("<h1", "<h2"), ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)

  # Neutralise share links and hide share bars. The order matters: the
  # generic "share-bar" rule also rewrites the div produced just before it.
  article_only = re.sub(r'<a class="share(.*?)" data-social-name="(.*?)" href="(.*?)" target="_blank">', '<a href="">', article_only)
  article_only = re.sub(r'<li class="(.*?) share-bar__item">', '<li>', article_only)
  article_only = article_only.replace(
    '<div class="share-bar share-bar--sticky yr-share">',
    '<div class="share-bar share-bar--sticky yr-share" style="display:none;">')
  article_only = re.sub(r'<div class="(.*?) share-bar(.*?)>', '<div style="display:none;">', article_only)
  article_only = article_only.replace('<div class="yr-share">', '<div style="display:none;">')
  article_only = article_only.replace("><", ">\n<")  # one tag per line

  # Make root-relative links absolute. They previously pointed at
  # www.straitstimes.com, apparently copied from another parser.
  # NOTE(review): assumes the NST site host is www.nst.com.my - confirm.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.nst.com.my/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+48
newsParser/newsParser/newsNewYorkTimes.py
... ...
@@ -0,0 +1,48 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+  
6
def article(url):
  """Download a New York Times article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the title, description, lead image and cleaned body.

  Raises:
    ValueError: if the body start or end marker is missing from the page.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  # Title, description and image come from the shared newsParser helpers.
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  articleCstBegin = "<section name=\"articleBody\""
  articleCstEnd   = "</article>"
  indexBegin = content.index(articleCstBegin)
  # Look for the closing tag after the body start so that an earlier
  # "</article>" in the page cannot produce an empty slice.
  indexEnd   = content.index(articleCstEnd, indexBegin)

  article_only = ""
  article_only += "<h2>"+articleStrTitle+"</h2>\n"
  article_only += "<em>"+articleStrDescription+"</em>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  article_only += content[indexBegin:indexEnd]

  # Applied in order: headings are demoted h2->h3 before h1->h2 so that a
  # demoted h1 is not demoted a second time.
  substitutions = (
    (r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<p>Advertisement</p>", ''),
    # Collapse <picture> to a plain <img> using the first srcSet URL without
    # its query string; raw string so "\g<2>" is a valid group reference.
    (r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">'),
    (r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", ''),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  # Turn site-relative links into protocol-relative links on nytimes.com.
  article_only = re.sub(r"href=\"\/", 'href="//www.nytimes.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+56
newsParser/newsParser/newsNewYorker.py
... ...
@@ -0,0 +1,56 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a New Yorker article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the cleaned article markup.

  Raises:
    ValueError: if "<article" or both end markers are missing from the page.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  # Title, description and image come from the shared newsParser helpers.
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  articleCstBegin = "<article"
  articleCstEnd   = "<div class=\"related-entries"
  articleCstEnd2   = "</article>"
  indexBegin = content.index(articleCstBegin)
  try:
    # Prefer cutting before the related-entries block.
    indexEnd   = content.index(articleCstEnd, indexBegin)
  except ValueError:
    # Fall back to the closing article tag.
    indexEnd   = content.index(articleCstEnd2, indexBegin)
  article_only = content[indexBegin:indexEnd]

  # Flags must never be passed as the 4th positional argument of re.sub:
  # that slot is `count`, so re.MULTILINE (8) would cap replacements at 8.
  # None of these patterns use ^ or $, so no flag is needed.
  # NOTE(review): "." does not match newlines here, so elements spanning
  # several lines are left in place; re.DOTALL would be needed to strip them.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
    (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
    (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
    (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
    (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    (r"<div>Save this story for later.</div>", ''),
    (r"<a class=\"sc-(.*?)byline__name-link button", '<a '),
    (r"<li class=\"social-icons__list-item social-icons__list-item--print social-icons__list-item--standard thinner\">(.*?)</li>", ''),
    (r"<li class=\"social-icons__list-item social-icons__list-item--bookmark social-icons__list-item--standard thinner bookmark-disabled\">(.*?)</li>", ''),
    (r"<ul class=\"social-icons__list\">(.*?)</ul>", ''),
    (r"<aside class=\"sc(.*?)</aside>", ''),
    (r"<noscript>(.*?)</noscript>", ''),
    (r"<svg class=\"icon icon-print\" width=\"17\" height=\"16\" viewBox=\"0 0 17 16\" fill=\"none\" xmlns=\"http://www.w3.org/2000/svg\">", ''),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+44
newsParser/newsParser/newsNouvelObs.py
... ...
@@ -0,0 +1,44 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download the AMP version of a Nouvel Obs article as simplified HTML.

  Args:
    url: address of the article page; ".html" is rewritten to ".amp"
      before the request is made.

  Returns:
    A string holding Open Graph <meta> tags (og:url carries the rewritten
    address) followed by an <article> element with the cleaned body.

  Raises:
    ValueError: if the header marker or both end markers are missing.
  """
  say("Article: "+url)
  url = url.replace(".html",".amp")
  r = requests.get(url, allow_redirects=True)
  content = r.text

  # Title, description and image come from the shared newsParser helpers.
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  articleCstBegin = "<header class=\"article__header\">"
  articleCstEnd   = "<span class=\"article-comments__headline-title\">"
  articleCstEnd2   = "<div class=\"article-comments__comment-react\">"
  indexBegin = content.index(articleCstBegin)
  try:
    # Cut at the comments headline when present.
    indexEnd   = content.index(articleCstEnd, indexBegin)
  except ValueError:
    # Otherwise cut at the comment reaction block.
    indexEnd   = content.index(articleCstEnd2, indexBegin)

  article_only = content[indexBegin:indexEnd]
  # Applied in order: h2->h3 must run before h1->h2.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  # Turn site-relative links into protocol-relative links on nouvelobs.com.
  article_only = re.sub(r"href=\"\/", 'href="//www.nouvelobs.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+139
newsParser/newsParser/newsSCMP.py
... ...
@@ -0,0 +1,139 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import json
5
+
6
def article(url):
  """Download an SCMP article and rebuild it from the page's embedded JSON.

  The page embeds its state as JSON after "window.__APOLLO_STATE__="; the
  article headline, summary and body nodes are read from that structure
  and serialised back to HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the headline, summary and body.

  Raises:
    ValueError: if the JSON markers are missing or the JSON is invalid.
    KeyError: if the expected keys are absent from the JSON state.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  cstJsonBegin = "window.__APOLLO_STATE__="
  cstJsonEnd   = "</script><script>"
  indexBegin = content.index(cstJsonBegin) + len(cstJsonBegin)
  # Search for the terminator after the JSON start, not from offset 0.
  indexEnd   = content.index(cstJsonEnd, indexBegin)
  json_only = json.loads(content[indexBegin:indexEnd])

  # NOTE(review): this writes the whole state to data.json in the current
  # directory on every call; it looks like a debugging aid - confirm
  # whether anything reads the file before removing it.
  with open('data.json', 'w') as f:
    json.dump(json_only, f)

  # The ROOT_QUERY entry whose key mentions "applicationId" points at the
  # article record; when several match, the last one is used.
  rootQuery = json_only["contentService"]["ROOT_QUERY"]
  keyArticle = None
  for key in rootQuery:
    if "\"applicationId\":" in key:
      keyArticle = rootQuery[key]["id"]

  json_article = json_only["contentService"][keyArticle]
  articleStrTitle = json_article["socialHeadline"]

  # The summary is a list of nodes; each is rendered as <type>text</type>.
  articleStrDescription = ""
  for node in json_article["summary"]["json"]:
    htmlType = node["type"]
    htmlContent = node["children"][0]["data"]
    articleStrDescription += "<"+htmlType+">"+htmlContent+"</"+htmlType+">"

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">'
  pageContent += '<meta property="og:url" content="'+url+'">'
  pageContent += "<article>"
  pageContent += "<h2>"+articleStrTitle+"</h2>\n"
  pageContent += "<em>"+articleStrDescription+"</em>\n"

  # The body lives under a key starting with 'body({"customContents"'.
  keyArticle2 = ""
  for key in json_article:
    if "body({\"customContents\"" in key:
      keyArticle2 = key
  json_article2 = json_article[keyArticle2]

  # Node types containing any of these fragments are skipped entirely.
  skippedTypes = ("ad1", "ad2", "ad3", "ad4", "ad5", "native-ads", "more-on-this")

  cpt = 0
  for element in json_article2["json"]:
    htmlType = element["type"]
    if any(fragment in htmlType for fragment in skippedTypes):
      continue
    try:
      htmlContent = element["children"]
    except KeyError:
      # Nodes without children carry nothing to render.
      continue
    pageContent += "<"+htmlType+">"

    for elementChildren in htmlContent:
      htmlTypeChildren = elementChildren["type"]
      if "text" == htmlTypeChildren:
        pageContent += elementChildren["data"]
      elif "a" == htmlTypeChildren:
        href = elementChildren["attribs"]["href"]
        pageContent += '<a href="'+href+'" target="new-'+str(cpt)+'">'
        pageContent += elementChildren["children"][0]["data"]
        pageContent += "</"+htmlTypeChildren+">"
      elif "img" == htmlTypeChildren:
        src = elementChildren["attribs"]["src"]
        caption = elementChildren["attribs"]["title"]
        pageContent += '<img src="'+src+'">'
        pageContent += "<figcaption><em>"+caption+"</em></figcaption>"
        try:
          pageContent += elementChildren["children"][0]["data"]
        except (KeyError, IndexError, TypeError):
          # Optional inner text; absent on most images.
          pass
      elif "iframe" == htmlTypeChildren:
        src = elementChildren["attribs"]["src"]
        caption = elementChildren["attribs"]["title"]
        pageContent += '<iframe src="'+src+'">'
        try:
          pageContent += elementChildren["children"][0]["data"]
        except (KeyError, IndexError, TypeError):
          pass
        pageContent += "</iframe>"
        pageContent += '<figcaption><em><a href="'+src+'" target="new-'+str(cpt)+'">'+caption+"</a></em></figcaption>"
      elif "em" == htmlTypeChildren:
        pageContent += "<"+htmlTypeChildren+">"
        try:
          pageContent += elementChildren["children"][0]["data"]
        except (KeyError, IndexError, TypeError):
          pass
        pageContent += "</"+htmlTypeChildren+">"
      else:
        # Unhandled child types are reported and left out of the output.
        print("OTHER : "+htmlTypeChildren)
    pageContent += "</"+htmlType+">\n"
    cpt += 1
  pageContent += "</article>"
  return pageContent
+46
newsParser/newsParser/newsStraitsTimes.py
... ...
@@ -0,0 +1,46 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Fetch a Straits Times article and return it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    Open Graph <meta> tags followed by an <article> element containing the
    title, the lead image and the cleaned article body.
  """
  say("Article: "+url)
  requestHeaders = {
    'Accept-Encoding': 'deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
  }
  content = requests.get(url, allow_redirects=True, headers=requestHeaders).text

  # Locate the body before asking newsParser for the metadata.
  bodyStart = content.index("<div class=\"odd field-item\" itemprop=\"articleBody\"")
  bodyStop = content.index("<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">")

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  metaTags = [
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="' + articleStrTitle + '">\n',
    '<meta property="og:description" content="' + articleStrDescription + '">\n',
    '<meta property="og:url" content="' + url + '">\n',
    '<meta property="og:image" content="' + articleStrImageUrl + '">\n',
    '<meta property="og:image:type" content="image/jpeg">',
  ]

  body = "<h2>" + articleStrTitle + "</h2>\n"
  body += '<img src="' + articleStrImageUrl + '">\n'
  body += content[bodyStart:bodyStop]

  # Ordered clean-up rules; h2->h3 has to precede h1->h2.
  cleanupRules = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
    (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
    (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
    (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
    (r"<div class=\"yr-share\">", '<div style="display:none;">'),
  )
  for rulePattern, ruleReplacement in cleanupRules:
    body = re.sub(rulePattern, ruleReplacement, body)
  body = body.replace("><", ">\n<")

  # Site-relative links become protocol-relative links on straitstimes.com.
  body = re.sub(r"href=\"\/", 'href="//www.straitstimes.com/', body)
  return "".join(metaTags) + "<article>" + body + "</article>"
+64
newsParser/newsParser/newsSudOuest.py
... ...
@@ -0,0 +1,64 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+  
6
def article(url):
  """Download a Sud Ouest article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags, the title and lead image,
    followed by an <article> element with the cleaned body.

  Raises:
    ValueError: if "<article " or all three end markers are missing.
  """
  say("Article: "+url)
  # NOTE(review): this rewrites midilibre.fr addresses, which looks copied
  # from the Midi Libre parser; it leaves sudouest.fr URLs untouched -
  # confirm whether an AMP rewrite for Sud Ouest was intended.
  url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
  r = requests.get(url, allow_redirects=True)
  content = r.text
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'
  pageContent += "<h2>"+articleStrTitle+"</h2>\n"
  pageContent += '<img src="'+articleStrImageUrl+'">\n'

  articleCstBegin = "<article "
  # End markers in order of preference: article footer, paywall section,
  # then the closing article tag.
  articleCstEnds = (
    "<div class=\"article-full__footer\">",
    "<section subscriptions-section=\"content-not-granted\">",
    "</article>",
  )
  indexBegin = content.index(articleCstBegin)
  for articleCstEnd in articleCstEnds:
    indexEnd = content.find(articleCstEnd, indexBegin)
    if indexEnd != -1:
      break
  else:
    raise ValueError("substring not found")
  article_only = content[indexBegin:indexEnd]

  # Flags must never be passed as the 4th positional argument of re.sub:
  # that slot is `count`, so re.MULTILINE (8) would cap replacements at 8.
  # None of these patterns use ^ or $, so no flag is needed.
  # NOTE(review): the "<button" rule emits a closing ">" while other rules
  # strip attributes together with their ">"; the rules appear to rely on
  # each other, so their order and text are kept as they are.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<path class=\"icon-base-fill\"(.*?)</path>", ''),
    (r"<path class=\"icon-base-fill\"(.*?)\"/>", ''),
    (r"onclick=\"window.open\(&quot;https:(.*?)\);\">", ''),
    (r"<span class=\"text\">S'abonner</span>", ''),
    (r"<div id=\"pub_dfp_inread1\" class=\"pub pub_dfp pub_dfp_inread1 upto-tablet base-margin-bottom pub_with_light_background\"></div>", ''),
    (r"href=\"mailto:\?subject=(.*?)\"", ''),
    (r"<svg class=\"(.*?)\" viewBox=\"0 0 (.*?) (.*?)\">", '<svg>'),
    (r"<svg viewBox=\"0 0 (.*?) (.*?)\">", '<svg>'),
    (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline\"", ''),
    (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline upto-tablet\"", ''),
    (r"<button", '<button style="display:none;">'),
    (r"<aside class=\"social-links", '<aside style="display:none;" class="social-links'),
    (r"onclick=\"if\(navigator\.share\) (.*?)return false;\"    >", ''),
    # Site-relative links become protocol-relative links on sudouest.fr.
    (r"href=\"\/", 'href="//www.sudouest.fr/'),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+69
newsParser/newsParser/newsTelerama.py
... ...
@@ -0,0 +1,69 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the og:image URL found in *content*.

  The value is the text between '<meta property="og:image" content="' and
  the next '">'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '">'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
13
+  
14
def articleTitle(content):
  """Return the og:title value found in *content*.

  The value is the text between '<meta property="og:title" content="' and
  the next '">'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '">'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
21
+  
22
def articleAbonnes(content):
  """Tell whether the page mentions subscriber-only access.

  Args:
    content: HTML text of the page.

  Returns:
    "Abonn&eacute;s" when *content* contains "réservé aux abonnés",
    otherwise an empty string (also for non-string input).
  """
  marker = "réservé aux abonnés"
  # Plain membership test instead of using an exception as control flow;
  # the type guard keeps non-string input mapping to "" rather than raising.
  if isinstance(content, str) and marker in content:
    return "Abonn&eacute;s"
  return ""
31
+  
32
def article(url):
  """Download a Telerama article and rebuild it as simplified HTML.

  Subscriber-only articles are cut at the sentence
  "Cet article est réservé aux abonnés"; other articles are cut at the
  closing </article> tag.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags, the title and lead image, an
    <article> element with the cleaned body, and a trailing <p> carrying
    the subscriber label (empty for free articles).

  Raises:
    ValueError: if a metadata tag or a body marker is missing.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text
  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrType = articleAbonnes(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'
  pageContent += "<h2>"+articleStrTitle+"</h2>\n"
  pageContent += '<img src="'+articleStrImageUrl+'">\n'

  articleCstBegin = "<article "
  # Compare by value: "is" against a string literal tests object identity,
  # which is implementation-dependent and triggers a SyntaxWarning.
  if articleStrType == "":
    articleCstEnd   = "</article>"
  else:
    articleCstEnd   = "Cet article est réservé aux abonnés"
  indexBegin = content.index(articleCstBegin)
  indexEnd   = content.index(articleCstEnd, indexBegin)
  article_only = content[indexBegin:indexEnd]

  # Applied in order: h2->h3 must run before h1->h2.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    # Site-relative links become protocol-relative links on www.telerama.fr.
    (r"href=\"\/", 'href="//www.telerama.fr/'),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)

  pageContent += "<article>"+article_only+"</article>"
  pageContent += "<p>"+articleStrType+"</p>"
  return pageContent
+43
newsParser/newsParser/newsTheAtlantic.py
... ...
@@ -0,0 +1,43 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download an Atlantic article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the cleaned body.

  Raises:
    ValueError: if "<article " or the following "</article>" is missing.
  """
  say("Article: "+url)
  # NOTE(review): this rewrites midilibre.fr addresses, which looks copied
  # from the Midi Libre parser and leaves theatlantic.com URLs untouched -
  # confirm it can be dropped.
  url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
  r = requests.get(url, allow_redirects=True)
  content = r.text

  # Title, description and image come from the shared newsParser helpers.
  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  articleCstBegin = "<article "
  articleCstEnd   = "</article>"
  indexBegin = content.index(articleCstBegin)
  indexEnd   = content.index(articleCstEnd,indexBegin)
  article_only = content[indexBegin:indexEnd]

  # First pass, applied in order: h2->h3 must run before h1->h2.
  # NOTE(review): "<gtp-ad ... ></gpt>" may be a typo for a gpt-ad element;
  # the pattern is kept as-is - verify against a live page.
  firstPass = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<gtp-ad(.*?)></gpt>", ''),
  )
  for pattern, replacement in firstPass:
    article_only = re.sub(pattern, replacement, article_only)

  # One tag per line before the second pass.
  article_only = article_only.replace("><", ">\n<")

  # Second pass. Flags must never be passed as the 4th positional argument
  # of re.sub: that slot is `count`, so re.MULTILINE (8) would cap
  # replacements at 8. These patterns use neither ^ nor $, so no flag is
  # needed.
  secondPass = (
    (r"<h3 class=\"ArticleRecirc_heading__(.*?)\">Recommended Reading</h3>", ''),
    (r"<ul class=\"ArticleRecirc_list__(.*?)\">", '<ul style="display: none;">'),
    (r"<button class=\"ArticleShare_shareButton__(.*?)\" aria-haspopup=\"true\" aria-controls=\"expanded-share-kit\" aria-expanded=\"false\" aria-label=\"Open Share Menu\" data-action=\"click share - expand\">Share</button>", ''),
    # Site-relative links become protocol-relative links on theatlantic.com.
    (r"href=\"\/", 'href="//www.theatlantic.com/'),
  )
  for pattern, replacement in secondPass:
    article_only = re.sub(pattern, replacement, article_only)

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+74
newsParser/newsParser/newsTheGuardian.py
... ...
@@ -0,0 +1,74 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the og:image URL found in *content*.

  The value is the text between '<meta property="og:image" content="' and
  the next '"/>'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
14
+  
15
def articleTitle(content):
  """Return the og:title value found in *content*.

  The value is the text between '<meta property="og:title" content="' and
  the next '"/>'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
22
+  
23
def articleDescription(content):
  """Return the og:description value found in *content*.

  The value is the text between '<meta property="og:description"
  content="' and the next '"/>'. Raises ValueError when either marker is
  missing.
  """
  opening = '<meta property="og:description" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
30
+  
31
+  
32
def article(url):
  """Download a Guardian article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the title, description, lead image and cleaned body.

  Raises:
    ValueError: if a body marker or a metadata tag is missing.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleCstBegin = "<div class=\"article-body-commercial-selector"
  articleCstEnd   = "<div id=\"slot-body-end\">"
  indexBegin = content.index(articleCstBegin)
  # Look for the end marker after the body start, not from offset 0.
  indexEnd   = content.index(articleCstEnd, indexBegin)
  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  article_only = ""
  article_only += "<h2>"+articleStrTitle+"</h2>\n"
  article_only += "<em>"+articleStrDescription+"</em>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  article_only += content[indexBegin:indexEnd]

  # Applied in order: h2->h3 must run before h1->h2.
  substitutions = (
    (r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<p>Advertisement</p>", ''),
    # Collapse <picture> to a plain <img> using the first srcSet URL without
    # its query string; raw string so "\g<2>" is a valid group reference.
    (r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">'),
    (r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", ''),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  # Site-relative links must resolve against the Guardian's own host.
  article_only = re.sub(r"href=\"\/", 'href="//www.theguardian.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+63
newsParser/newsParser/newsTheStarMy.py
... ...
@@ -0,0 +1,63 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the og:image URL found in *content*.

  The value is the text between '<meta property="og:image" content="' and
  the next '" />'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
13
+  
14
def articleTitle(content):
  """Return the og:title value found in *content*.

  The value is the text between '<meta property="og:title" content="' and
  the next '" />'. Raises ValueError when either marker is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStop = content.index(closing, tagStart)
  return content[tagStart + len(opening):valueStop]
21
+  
22
def article(url):
  """Download a The Star (Malaysia) article and rebuild it as simplified HTML.

  Args:
    url: address of the article page to fetch.

  Returns:
    A string holding Open Graph <meta> tags followed by an <article>
    element with the title, lead image and cleaned body.

  Raises:
    ValueError: if a body marker or a metadata tag is missing.
  """
  say("Article: "+url)
  # A browser-like User-Agent and a deflate-only encoding are sent with the
  # request.
  r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
  content = r.text
  articleCstBegin = "<article"
  articleCstEnd   = "<!-- /Pagination -->"
  indexBegin = content.index(articleCstBegin)
  # Look for the end marker after the body start, not from offset 0.
  indexEnd   = content.index(articleCstEnd, indexBegin)

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += '<meta property="og:type" content="article">\n'
  pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
  pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
  pageContent += '<meta property="og:url" content="'+url+'">\n'
  pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
  pageContent += '<meta property="og:image:type" content="image/jpeg">'

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += '<img src="'+articleStrImageUrl+'">\n'
  article_only += content[indexBegin:indexEnd]

  # Applied in order: h2->h3 must run before h1->h2.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
    (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
    (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
    (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
    (r"<div class=\"yr-share\">", '<div style="display:none;">'),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  # Site-relative links must resolve against The Star's own host.
  article_only = re.sub(r"href=\"\/", 'href="//www.thestar.com.my/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+81
newsParser/newsParser/newsTheVerge.py
... ...
@@ -0,0 +1,81 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the value of the page's og:image meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
13
+  
14
def articleTitle(content):
  """Return the value of the page's og:title meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
21
+  
22
def articleDescription(content):
  """Return the value of the page's og:description meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:description" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
29
+  
30
+  
31
def article(url):
  """Download a The Verge article and return a simplified HTML page.

  The returned string consists of Open Graph <meta> tags (type, title,
  description, url, image) followed by an <article> element holding the
  title, the description, the lead image and the article body cut out of
  the downloaded page.

  url is the article address; it is passed to say() and requests.get().

  Raises ValueError (from str.index) when an og: meta tag or every known
  begin/end marker is missing from the page.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
  content = r.text

  # Markers delimiting the article body; the numbered ones are fallbacks.
  articleCstBegin = "<div class=\"c-entry-content \">"
  articleCstBegin2 = "<article"
  articleCstEnd   = "<div class=\"u-hidden-text\" id=\"formatter-datter\""
  articleCstEnd2   = "<section class=\"c-related-list\">"
  articleCstEnd3   = "</article"

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  # Only a missing marker (ValueError) triggers the fallbacks; anything else
  # (KeyboardInterrupt, programming errors) must propagate.
  try:
    indexBegin   = content.index(articleCstBegin)
  except ValueError:
    indexBegin   = content.index(articleCstBegin2)
  try:
    indexEnd   = content.index(articleCstEnd)
  except ValueError:
    try:
      indexEnd   = content.index(articleCstEnd2)
    except ValueError:
      indexEnd   = content.index(articleCstEnd3)

  article_only = ""
  article_only += "<h2>"+articleStrTitle+"</h2>\n"
  article_only += "<em>"+articleStrDescription+"</em>\n"
  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
  article_only += content[indexBegin:indexEnd]
  article_only = re.sub(r"<amp-img", '<img', article_only)
  article_only = re.sub(r"</amp-img>", '', article_only)
  # Demote headings by one level: h2 first, so former h1s are not demoted twice.
  article_only = re.sub(r"<h2", '<h3', article_only)
  article_only = re.sub(r"</h2>", '</h3>', article_only)
  article_only = re.sub(r"<h1", '<h2', article_only)
  article_only = re.sub(r"</h1>", '</h2>', article_only)

  # Point site-relative links at www.theverge.com. The (?!/) lookahead leaves
  # protocol-relative URLs ("//host/...") alone instead of prefixing them too.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.theverge.com/', article_only)
  article_only = re.sub(r'src="/(?!/)', 'src="//www.theverge.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+67
newsParser/newsParser/newsViceCom.py
... ...
@@ -0,0 +1,67 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleImage(content):
  """Return the value of the page's og:image meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '"/>' at or after it, is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
13
+  
14
def articleTitle(content):
  """Return the value of the page's og:title meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '"/>' at or after it, is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
21
+  
22
+  
23
def article(url):
  """Download a Vice article and return a simplified HTML page.

  The returned string consists of Open Graph <meta> tags (type, title,
  description, url, image) followed by an <article> element holding the
  title, the lead image and the article body cut out of the downloaded page.

  url is the article address; it is passed to say() and requests.get().

  Raises ValueError (from str.index) when neither the short-form nor the
  long-form begin/end marker is found; the metadata helpers may raise too.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  # Short-form markers first, long-form markers as fallback.
  articleCstBegin = "<div class=\"short-form__body\">"
  articleCstBegin2 = "<div class=\"article__longform__content\">"
  articleCstEnd   = "<div class=\"article__tagged\">"
  articleCstEnd2   = "<div class=\"article__longform__tags\">"

  # Only a missing marker (ValueError) triggers the fallback; anything else
  # (KeyboardInterrupt, programming errors) must propagate.
  try:
    indexBegin = content.index(articleCstBegin)
  except ValueError:
    indexBegin = content.index(articleCstBegin2)

  try:
    indexEnd = content.index(articleCstEnd)
  except ValueError:
    indexEnd = content.index(articleCstEnd2)

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
  article_only += content[indexBegin:indexEnd]
  article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
  # Demote headings by one level: h2 first, so former h1s are not demoted twice.
  article_only = re.sub(r"<h2", '<h3', article_only)
  article_only = re.sub(r"</h2>", '</h3>', article_only)
  article_only = re.sub(r"<h1", '<h2', article_only)
  article_only = re.sub(r"</h1>", '</h2>', article_only)
  article_only = article_only.replace("><", ">\n<")

  # Point site-relative links at www.vice.com. The (?!/) lookahead leaves
  # protocol-relative URLs ("//host/...") alone instead of prefixing them too.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.vice.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+68
newsParser/newsParser/newsWaPo.py
... ...
@@ -0,0 +1,68 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a Washington Post article and return a simplified HTML page.

  The returned string consists of a charset <meta>, Open Graph <meta> tags
  (type, title, description, url, image) and an <article> element holding
  the title, the lead image and the cleaned-up article markup.

  url is the article address; it is passed to say() and requests.get().

  Raises ValueError (from str.index) when "<article" or both end markers are
  missing; the newsParser metadata helpers may raise as well.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  r.encoding = r.apparent_encoding
  content = r.text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  # For imrs.php URLs keep only the src= target (everything up to the last "&").
  articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)

  pageContent = ""
  pageContent += "<meta charset=\"utf-8\"/>"
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  articleCstBegin = "<article"
  articleCstEnd   = "<div class=\"mt-md\">"
  articleCstEnd2   = "</article>"
  indexBegin = content.index(articleCstBegin)
  # Only a missing marker (ValueError) triggers the fallback; anything else
  # (KeyboardInterrupt, programming errors) must propagate.
  try:
    indexEnd  = content.index(articleCstEnd)
  except ValueError:
    indexEnd  = content.index(articleCstEnd2)

  article_only = "<h2>"+articleStrTitle+"</h2>"
  # Append (+=) the image: a plain assignment here would discard the title.
  article_only += "<img src=\""+articleStrImageUrl+"\">"
  article_only += content[indexBegin:indexEnd]

  # (pattern, replacement) pairs applied in order. Headings are demoted h2
  # first so former h1s are not demoted twice.
  substitutions = (
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    # Drop the "Advertisement" placeholders.
    (r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", ''),
    (r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", ''),
    # Shrink fixed-size boxes down to one pixel.
    (r"style=\"width:300px;height:250px\"", 'style="width:1px;height:1px"'),
    (r"style=\"width:120px;height:32px\"", 'style="width:1px;height:1px"'),
    (r"style=\"width:136px;height:20px\"", 'style="width:1px;height:1px"'),
    (r"style=\"width:300px;height:600px\"", 'style="width:1px;height:1px"'),
    (r"style=\"min-height:298px\"", 'style="min-height:1px"'),
    (r"style=\"min-height:600px\"", 'style="min-height:1px"'),
    # Strip presentation-only attributes, wrappers and the blur filter.
    (r"class=\"center absolute w-100\" style=\"top:-12px\"", ''),
    (r"<div data-qa=\"drop-cap-letter\">", '<div>'),
    (r"filter:blur\(10px\);", ''),
    (r"<div class=\"bg-pattern-1\".+?>", '<div>'),
    (r"<div class=\"bg-pattern-2\".+?>", '<div>'),
    (r"<img class=\"dn canvas-foreground\" src=\".+?\"/>", ''),
    (r"<div class=\"subhead .+?>", '<div>'),
  )
  for pattern, replacement in substitutions:
    article_only = re.sub(pattern, replacement, article_only)
  article_only = article_only.replace("><", ">\n<")

  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+66
newsParser/newsParser/newsYahooCom.py
... ...
@@ -0,0 +1,66 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def articleTitle(content):
  """Return the value of the page's og:title meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:title" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
13
+  
14
def articleImage(content):
  """Return the value of the page's og:image meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:image" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
21
+  
22
+  
23
def articleDescription(content):
  """Return the value of the page's og:description meta tag.

  content is the page HTML as a string. str.index raises ValueError when
  the opening marker, or a closing '" />' at or after it, is missing.
  """
  opening = '<meta property="og:description" content="'
  closing = '" />'
  tagStart = content.index(opening)
  valueStart = tagStart + len(opening)
  # The closing marker is looked up from the start of the tag onwards.
  valueStop = content.index(closing, tagStart)
  return content[valueStart:valueStop]
30
+  
31
def article(url):
  """Download a Yahoo News article and return a simplified HTML page.

  The returned string consists of Open Graph <meta> tags (type, title,
  description, url, image), then the title, description and lead image,
  and finally an <article> element holding the article body cut out of the
  downloaded page.

  url is the article address; it is passed to say() and requests.get()
  unchanged (no dna.fr "/amp/" rewrite is applied to it in this parser).

  Raises ValueError (from str.index) when an og: meta tag or one of the
  begin/end markers is missing from the page.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text
  articleCstBegin = "<div class=\"caas-body\">"
  articleCstEnd   = "</article>"
  articleStrTitle = articleTitle(content)
  articleStrImageUrl = articleImage(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  pageContent += "<h2>"+articleStrTitle+"</h2>\n"
  pageContent += "<em>"+articleStrDescription+"</em>\n"
  pageContent += "<img src=\""+articleStrImageUrl+"\">\n"

  indexBegin = content.index(articleCstBegin)
  indexEnd   = content.index(articleCstEnd)
  article_only = content[indexBegin:indexEnd]
  article_only = re.sub(r"<amp-img", '<img', article_only)
  article_only = re.sub(r"</amp-img>", '', article_only)
  # Demote headings by one level: h2 first, so former h1s are not demoted twice.
  article_only = re.sub(r"<h2", '<h3', article_only)
  article_only = re.sub(r"</h2>", '</h3>', article_only)
  article_only = re.sub(r"<h1", '<h2', article_only)
  article_only = re.sub(r"</h1>", '</h2>', article_only)
  # Point site-relative links at news.yahoo.com. The (?!/) lookahead leaves
  # protocol-relative URLs ("//host/...") alone instead of prefixing them too.
  article_only = re.sub(r'href="/(?!/)', 'href="//news.yahoo.com/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+38
newsParser/newsParser/newsZDNetFr.py
... ...
@@ -0,0 +1,38 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+
6
def article(url):
  """Download a ZDNet.fr article and return a simplified HTML page.

  The returned string consists of Open Graph <meta> tags (type, title,
  description, url, image) followed by an <article> element holding the
  markup between the first "<article " and the first "</article>" of the
  downloaded page.

  url is the article address; it is passed to say() and requests.get().

  Raises ValueError (from str.index) when either marker is missing; the
  newsParser metadata helpers may raise as well.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  articleCstBegin = "<article "
  articleCstEnd   = "</article>"
  indexBegin = content.index(articleCstBegin)
  indexEnd   = content.index(articleCstEnd)
  article_only = content[indexBegin:indexEnd]
  article_only = re.sub(r"<amp-img", '<img', article_only)
  article_only = re.sub(r"</amp-img>", '', article_only)
  # Demote headings by one level: h2 first, so former h1s are not demoted twice.
  article_only = re.sub(r"<h2", '<h3', article_only)
  article_only = re.sub(r"</h2>", '</h3>', article_only)
  article_only = re.sub(r"<h1", '<h2', article_only)
  article_only = re.sub(r"</h1>", '</h2>', article_only)
  article_only = article_only.replace("><", ">\n<")

  # Point site-relative links at www.zdnet.fr. The (?!/) lookahead leaves
  # protocol-relative URLs ("//host/...") alone instead of prefixing them too.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.zdnet.fr/', article_only)
  pageContent += "<article>"+article_only+"</article>"
  return pageContent