... | ... |
@@ -0,0 +1,213 @@ |
1 |
+#!/usr/bin/env python3 |
|
2 |
+# encoding: UTF-8 |
|
3 |
+__author__ = 'Yanik Cawidrone' |
|
4 |
+__version__ = '0.1' |
|
5 |
+ |
|
6 |
+""" |
|
7 |
+ For more see the file 'LICENSE' for copying permission. |
|
8 |
+""" |
|
9 |
+ |
|
10 |
+from .newsParser import newsDNA |
|
11 |
+from .newsParser import newsFt |
|
12 |
+from .newsParser import newsLeParisien |
|
13 |
+from .newsParser import newsLiberation |
|
14 |
+from .newsParser import newsWaPo |
|
15 |
+from .newsParser import newsZDNetFr |
|
16 |
+from .newsParser import newsSCMP |
|
17 |
+from .newsParser import newsTelerama |
|
18 |
+from .newsParser import newsCNA |
|
19 |
+from .newsParser import newsViceCom |
|
20 |
+from .newsParser import newsNewYorkTimes |
|
21 |
+from .newsParser import newsMothershipSG |
|
22 |
+from .newsParser import newsLeMonde |
|
23 |
+from .newsParser import newsChallengesFr |
|
24 |
+from .newsParser import newsJDD |
|
25 |
+from .newsParser import newsMidiLibre |
|
26 |
+from .newsParser import newsNouvelObs |
|
27 |
+from .newsParser import newsHuffPost |
|
28 |
+from .newsParser import newsStraitsTimes |
|
29 |
+from .newsParser import newsNewYorker |
|
30 |
+from .newsParser import newsLeFigaro |
|
31 |
+from .newsParser import newsSudOuest |
|
32 |
+from .newsParser import newsBBC |
|
33 |
+from .newsParser import newsTheAtlantic |
|
34 |
+from .newsParser import newsTheStarMy |
|
35 |
+from .newsParser import newsNSTMy |
|
36 |
+from .newsParser import newsLaDepeche |
|
37 |
+from .newsParser import newsTheGuardian |
|
38 |
+from .newsParser import newsBloomberg |
|
39 |
+from .newsParser import newsFranceTVInfo |
|
40 |
+from .newsParser import newsTheVerge |
|
41 |
+from .newsParser import newsBondyBlog |
|
42 |
+from .newsParser import newsFrandroidCom |
|
43 |
+from .newsParser import newsBuzzfeedCom |
|
44 |
+from .newsParser import newsYahooCom |
|
45 |
+from .newsParser import newsBFM |
|
46 |
+# ~ from .newsParser import newsTodayOnlineSG |
|
47 |
+ |
|
48 |
def supportedList():
    """Return an HTML ``<ul>`` listing the news parsers visible in this module.

    Every name in this module's namespace that contains ``news`` but neither
    ``__`` nor ``newsParser`` becomes one ``<li>`` entry, with each ``news``
    substring removed (``newsBBC`` is rendered as ``BBC``).

    Returns:
        str: the list markup, opening with ``<ul>`` and closing with ``</ul>``,
        each tag followed by a newline.
    """
    import sys

    # sys.modules[__name__] is always this very module, whereas
    # __import__(__name__) returns the top-level package for a dotted name.
    current_module = sys.modules[__name__]
    items = [
        "<li>" + name.replace("news", "") + "</li>\n"
        for name in dir(current_module)
        if "__" not in name and "news" in name and "newsParser" not in name
    ]
    return "<ul>\n" + "".join(items) + "</ul>\n"
|
58 |
+ |
|
59 |
def _text_between(content, begin, end):
    """Return the text between the first *begin* and the next *end* after it.

    Raises:
        ValueError: if *begin* is absent, or *end* does not follow it.
    """
    start = content.index(begin) + len(begin)
    return content[start:content.index(end, start)]


def _open_graph_value(content, typeElement):
    """Return the ``og:<typeElement>`` meta value, or ``""`` when not found."""
    openings = (
        "<meta property=\"og:" + typeElement + "\" content=\"",
        "<meta data-rh=\"true\" property=\"og:" + typeElement + "\" content=\"",
    )
    for opening in openings:
        position = content.find(opening)
        if position != -1:
            # Skip the opening that actually matched, not always the first one.
            start = position + len(opening)
            break
    else:
        return ""
    # Stop at the nearest closing variant so the value never spills into a
    # following tag that happens to use another closing style.
    candidates = [content.find(closing, start) for closing in ("\" />", "\"/>", "\">")]
    candidates = [index for index in candidates if index != -1]
    if not candidates:
        return ""
    return content[start:min(candidates)]


def articleElement(typeElement, content):
    """Extract one metadata element from raw page markup.

    The page is recognised by marker strings found in *content*; Buzzfeed,
    Le Monde and New York Times pages get dedicated markers, every other page
    goes through a generic ``og:<typeElement>`` meta lookup.

    Args:
        typeElement: element name, e.g. ``"title"``, ``"image"`` or
            ``"description"``.
        content: markup of the page as a string.

    Returns:
        str: the extracted value. ``""`` for a Le Monde page when the type is
        neither image nor title, and for the generic lookup when no matching
        tag is present.

    Raises:
        ValueError: on a Buzzfeed, Le Monde or New York Times page whose
            expected marker is missing.
    """
    if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
        buzzfeed_openings = {
            "title": "\"headline\": \"",
            "description": "\"description\": \"",
        }
        # Compare by value (dict lookup), never with ``is`` on string literals.
        opening = buzzfeed_openings.get(typeElement)
        if opening is not None:
            return _text_between(content, opening, "\",")
        # No dedicated marker for this type (e.g. "image"): use the generic
        # lookup instead of failing on an unbound local.
        return _open_graph_value(content, typeElement)
    if "<meta property=\"og:url\" content=\"https://www.lemonde.fr/" in content:
        if typeElement == "image":
            return "http" + _text_between(
                content, "<meta property=\"og:image\" content=\"http", "\">")
        if typeElement == "title":
            return _text_between(
                content, "<meta property=\"og:title\" content=\"", "\">")
        return ""
    if "\"nytimes.com\"" in content:
        return _text_between(
            content,
            "<meta data-rh=\"true\" property=\"og:" + typeElement + "\" content=\"",
            "\"/>")
    return _open_graph_value(content, typeElement)
|
122 |
+ |
|
123 |
def articleTitle(content):
    """Return the title found in the page markup *content*.

    Prints a trace line, then delegates to ``articleElement`` with the
    ``"title"`` element type.
    """
    print("newsParser.articleTitle")
    title = articleElement("title", content)
    return title
|
126 |
+ |
|
127 |
def articleImage(content):
    """Return the image URL found in the page markup *content*.

    Prints a trace line, then delegates to ``articleElement`` with the
    ``"image"`` element type.
    """
    print("newsParser.articleImage")
    image_url = articleElement("image", content)
    return image_url
|
130 |
+ |
|
131 |
def articleDescription(content):
    """Return the description found in the page markup *content*.

    Prints a trace line, then delegates to ``articleElement`` with the
    ``"description"`` element type.
    """
    print("newsParser.articleDescription")
    description = articleElement("description", content)
    return description
|
134 |
+ |
|
135 |
# Ordered routing table: the first entry with a marker contained in the URL
# wins, so the order below is significant and mirrors the original chain.
_ARTICLE_ROUTES = (
    (("ft.com",), "newsFt"),
    (("dna.fr",), "newsDNA"),
    (("washingtonpost.com",), "newsWaPo"),
    (("leparisien.fr",), "newsLeParisien"),
    (("liberation.fr",), "newsLiberation"),
    (("zdnet.fr",), "newsZDNetFr"),
    (("scmp.com",), "newsSCMP"),
    (("telerama.fr",), "newsTelerama"),
    (("channelnewsasia.com",), "newsCNA"),
    (("vice.com",), "newsViceCom"),
    (("nytimes.com",), "newsNewYorkTimes"),
    (("mothership.sg",), "newsMothershipSG"),
    (("lemonde.fr",), "newsLeMonde"),
    (("lejdd.fr",), "newsJDD"),
    (("nouvelobs.com",), "newsNouvelObs"),
    (("huffingtonpost.",), "newsHuffPost"),
    (("huffpost.com",), "newsHuffPost"),
    (("straitstimes.com",), "newsStraitsTimes"),
    (("newyorker.com",), "newsNewYorker"),
    (("lefigaro.fr",), "newsLeFigaro"),
    (("sudouest.fr",), "newsSudOuest"),
    (("bbc.com",), "newsBBC"),
    (("theatlantic.com",), "newsTheAtlantic"),
    (("thestar.com.my",), "newsTheStarMy"),
    (("challenges.fr",), "newsChallengesFr"),
    (("depeche.fr",), "newsLaDepeche"),
    (("guardian.com", "guardian.co.uk"), "newsTheGuardian"),
    (("bloomberg.com",), "newsBloomberg"),
    (("francetvinfo.fr",), "newsFranceTVInfo"),
    (("theverge.com",), "newsTheVerge"),
    (("bondyblog.fr",), "newsBondyBlog"),
    (("frandroid.com",), "newsFrandroidCom"),
    (("buzzfeed.com", "buzzfeednews.com"), "newsBuzzfeedCom"),
    (("news.yahoo.com", "afp.com"), "newsYahooCom"),
    (("bfmtv.com",), "newsBFM"),
)


def getArticle(url):
    """Return the HTML rendition of the article at *url*.

    The URL is matched by substring against the routing table and handed to
    the ``article`` function of the first matching parser module. When nothing
    matches, a notice linking to the original URL is returned, followed by the
    list produced by ``supportedList``.

    Args:
        url: address of the article, or ``None``.

    Returns:
        str: the parser's output, the "unsupported" notice, or ``""`` when
        *url* is ``None``.
    """
    import html

    data_page = ""
    if url is None:
        return data_page

    for markers, parser_name in _ARTICLE_ROUTES:
        if any(marker in url for marker in markers):
            # Parser modules are imported by name at the top of this module,
            # so they are looked up in the module namespace when needed.
            data_page += globals()[parser_name].article(url)
            return data_page

    # The URL comes from the caller; escape it before embedding it in an
    # attribute so quotes or angle brackets cannot break out of the markup.
    safe_url = html.escape(url, quote=True)
    data_page += "<p>Unsupported News, click to open <a href=\""+safe_url+"\" target=\"new\">original link</a></p>\n"
    data_page += "<p>Supported News:"
    data_page += supportedList()
    data_page += "</p>\n"
    return data_page
... | ... |
@@ -0,0 +1,53 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+ |
|
5 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags followed by the article
    markup wrapped in ``<article>``; headings are demoted one level, sharing
    widgets and hidden captions are stripped, and root-relative links are
    pointed at ``www.bbc.com``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if the page has no ``<article `` tag or none of the known
            end markers.
    """
    say("Article: "+url)
    # NOTE(review): this midilibre rewrite looks like a copy/paste leftover
    # (the rest of the function targets bbc.com); kept as-is - confirm.
    url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    # NOTE(review): `newsParser` is not imported in the visible part of this
    # module; presumably it is provided by `from userio import *` - verify.
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    indexBegin = content.index("<article ")
    end_markers = (
        "<div class=\"article-full__footer\">",
        "<section data-component=\"tag-list\"",
        "</article>",
    )
    # The first marker present in the page wins, in the order listed.
    for end_marker in end_markers:
        indexEnd = content.find(end_marker)
        if indexEnd != -1:
            break
    else:
        raise ValueError("substring not found")

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    article_only = re.sub(r"<div id=\"share-tools-panel\" (.*?)>Share page</div>", '', article_only)
    article_only = re.sub(r"<a href=\"(.*?)\" class=\"(.*?)\">About sharing</a>", '', article_only)
    article_only = article_only.replace("><", ">\n<")
    article_only = re.sub(r"<span class=\"(.*?)-VisuallyHidden (.*?)\">image copyright</span>", '', article_only)
    article_only = re.sub(r"<span class=\"(.*?)-VisuallyHidden (.*?)\">image caption</span>", '', article_only)
    # The original passed re.MULTILINE as the 4th positional argument, which
    # is `count`, silently limiting each substitution to 8 matches.
    article_only = re.sub(r"<noscript>", '', article_only, flags=re.MULTILINE)
    article_only = re.sub(r"</noscript>", '', article_only, flags=re.MULTILINE)
    article_only = re.sub(r"<div class=\"(.*?)-TagShareWrapper (.*?)\">", '<div style="display: none;">', article_only, flags=re.MULTILINE)

    article_only = re.sub(r"href=\"\/", 'href=\"//www.bbc.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags followed by the
    ``content_body`` section of the page wrapped in ``<article>``, with
    headings demoted one level and root-relative links pointed at
    ``www.bfmtv.com``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup, with a newline inserted between adjacent
        tags.

    Raises:
        ValueError: if either ``content_body`` marker is missing from the page.
    """
    say("Article: "+url)
    # NOTE(review): the dna.fr rewrite looks like a copy/paste leftover (links
    # below are rewritten for bfmtv.com); kept as-is - confirm.
    url = url.replace("dna.fr/","dna.fr/amp/")
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrTitle = newsParser.articleTitle(content)
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleCstBegin = "<div class=\"content_body\">"
    articleCstEnd = "<div class=\"content_body\" id=\"content_body_bottom\">"

    # The original first built an <h2>/<img>/<em> header here and then reset
    # pageContent to "" before using it; those dead stores are dropped.
    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    indexBegin = content.index(articleCstBegin)
    indexEnd = content.index(articleCstEnd)
    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)

    article_only = re.sub(r"href=\"\/", 'href=\"//www.bfmtv.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    pageContent = pageContent.replace("><", ">\n<")
    return pageContent
... | ... |
@@ -0,0 +1,53 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags followed by an
    ``<article>`` holding the title, description, lead image and the article
    markup; headings are demoted one level, advertisement placeholders are
    removed and root-relative links are pointed at ``www.nytimes.com``.

    Missing begin or end markers are tolerated: the corresponding index falls
    back to 0, so a page without the end marker contributes no body markup.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text
    articleCstBegin = "<div class=\"article-content\">"
    articleCstBegin2 = "<time class=\"article-timestamp\""
    articleCstEnd = "<div class=\"bottom-left-rail-touts-spacer\">"

    # str.find returns -1 instead of raising, replacing the bare excepts.
    indexBegin = content.find(articleCstBegin)
    if indexBegin == -1:
        indexBegin = content.find(articleCstBegin2)
    if indexBegin == -1:
        indexBegin = 0
    indexEnd = content.find(articleCstEnd)
    if indexEnd == -1:
        indexEnd = 0

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    article_only = ""
    article_only += "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]
    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
    # Raw string: "\g" in a normal literal is an invalid escape sequence.
    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">', article_only)
    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
    article_only = article_only.replace("><", ">\n<")

    article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+ |
|
5 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags followed by the page
    markup from the ``pageHeader`` section up to the first known end marker,
    wrapped in ``<article>``; headings are demoted one level and root-relative
    links are pointed at ``www.bondyblog.fr``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if the ``pageHeader`` section or every end marker is
            missing from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    # NOTE(review): `newsParser` is not imported in the visible part of this
    # module; presumably it is provided by `from userio import *` - verify.
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    indexBegin = content.index("<section class=\"pageHeader\">")
    end_markers = (
        "<section class=\"pageComponents\">",
        "<section subscriptions-section=\"content-not-granted\">",
        "</article>",
    )
    # The first marker present in the page wins, in the order listed; only a
    # missing substring is handled, nothing else is swallowed.
    for end_marker in end_markers:
        indexEnd = content.find(end_marker)
        if indexEnd != -1:
            break
    else:
        raise ValueError("substring not found")

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    article_only = article_only.replace("><", ">\n<")

    article_only = re.sub(r"href=\"\/", 'href=\"//www.bondyblog.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,68 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
+ |
|
7 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    URLs that do not already contain ``/amphtml`` are rewritten to their
    ``amphtml`` form first. The result is a set of Open Graph ``<meta>`` tags
    followed by an ``<article>`` holding the title, description and article
    markup; headings are demoted one level, sharing widgets are neutralised
    and root-relative links are pointed at the originating Buzzfeed domain.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if the page has no ``<article `` tag or none of the known
            end markers.
    """
    say("Article: "+url)
    if "/amphtml" not in url:
        say("Trying AMP")
        url = url.replace("buzzfeednews.com/article","buzzfeednews.com/amphtml")
        url = url.replace("buzzfeed.com/","buzzfeed.com/amphtml/")
        # str.replace returns a new string; the original discarded the result,
        # so the query suffix was never removed.
        # NOTE(review): indentation was lost in the source; this line is
        # assumed to belong to the AMP rewrite block - confirm.
        url = url.replace("?origin=web-hf","")

    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleStrImageUrl = newsParser.articleImage(content)

    indexBegin = content.index("<article ")
    end_markers = (
        "<div class=\"subbuzz subbuzz-bfp\">",
        "</article>",
        "<div class=\"shares shares--inline",
    )
    # The first marker present in the page wins, in the order listed.
    for end_marker in end_markers:
        indexEnd = content.find(end_marker)
        if indexEnd != -1:
            break
    else:
        raise ValueError("substring not found")

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    article_only = ""
    article_only += "<h1>"+articleStrTitle+"</h1>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)

    article_only = re.sub(r"<amp-social-share (.*?)>", "<amp-social-share>", article_only)
    article_only = re.sub(r"<span class=\"icon icon--primary flex\">", "<span>", article_only)
    article_only = re.sub(r"<title>(.*?)</title>", "", article_only)
    article_only = re.sub(r"<use xlink:href=\"(.*?)\">", "<use>", article_only)
    article_only = re.sub(r"<svg class=\"svg-(.*?)\">", "<svg height=\"1px\">", article_only)
    article_only = re.sub(r"Share on Facebook", "", article_only)
    article_only = re.sub(r"Share on Pinterest", "", article_only)
    article_only = article_only.replace("><", ">\n<")

    if "buzzfeed.com" in url:
        article_only = re.sub(r"href=\"\/", 'href=\"//www.buzzfeed.com/', article_only)
    elif "buzzfeednews.com" in url:
        article_only = re.sub(r"href=\"\/", 'href=\"//www.buzzfeednews.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,59 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the first ``og:image`` meta tag in *content*.

    The value runs from just after the tag's ``content="`` up to the first
    ``">`` found at or after the start of the tag.

    Raises:
        ValueError: if the opening tag or the closing ``">`` is missing.
    """
    opening = "<meta property=\"og:image\" content=\""
    closing = "\">"
    tag_start = content.index(opening)
    value_start = tag_start + len(opening)
    # The closing marker is searched from the start of the tag, not from the
    # start of the value.
    value_end = content.index(closing, tag_start)
    return content[value_start:value_end]
|
13 |
+ |
|
14 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags followed by the markup
    between ``<article`` and the article footer, wrapped in ``<article>``. The
    lead image is inserted before the author link, advertisement and sharing
    widgets are removed or hidden, and root-relative links are pointed at
    ``www.channelnewsasia.com``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if ``<article`` or the footer marker is missing.
    """
    say("Article: "+url)
    content = requests.get(url, allow_redirects=True).text

    # Locate the article boundaries before extracting any metadata.
    start = content.index("<article")
    stop = content.index("<footer class=\"article__footer\">")

    image_url = newsParser.articleImage(content)
    title = newsParser.articleTitle(content)
    description = newsParser.articleDescription(content)

    head = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+title+"\">\n",
        "<meta property=\"og:description\" content=\""+description+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+image_url+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    lead_image = "<img src=\""+image_url+"\"><div class=\"article__author-link\">"
    # Applied strictly in this order. The heading demotion appears twice, as
    # in the original sequence.
    rewrites = [
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<div class=\"article__author-link\">", lead_image),
        (r"<span class=\"advertisement__title\">Advertisement</span>", ''),
        (r"class=\"picture__image lazyload\"", ''),
        (r"<a class=\"addthis_button(.*)</a>", ''),
        (r"<div class=\"c-sharing--default is-article-top-position\"", '<div class="c-sharing--default is-article-top-position" style="display:none"'),
        (r"<h3 class=\"save-for-later__title\">Bookmark</h3>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
    ]
    body = content[start:stop]
    for pattern, replacement in rewrites:
        body = re.sub(pattern, replacement, body)
    body = body.replace("><", ">\n<")
    body = re.sub(r"href=\"\/", 'href=\"//www.channelnewsasia.com/', body)

    return head + "<article>" + body + "</article>"
... | ... |
@@ -0,0 +1,55 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleAbonnes(content):
    """Tell whether the page is flagged as subscriber-only.

    Args:
        content: markup of the page as a string.

    Returns:
        str: ``"Abonnés"`` when *content* contains ``"réservé aux abonnés"``,
        otherwise ``""`` (also for non-string input).
    """
    marker = "réservé aux abonnés"
    # A membership test replaces the index()/bare-except dance, which also
    # swallowed unrelated errors and left an unused local behind.
    if isinstance(content, str) and marker in content:
        return "Abonnés"
    return ""
|
15 |
+ |
|
16 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendition.

    The result is a set of Open Graph ``<meta>`` tags, the title as ``<h2>``,
    the lead image, the article markup (headings demoted one level,
    root-relative links pointed at ``www.liberation.fr``) and a closing
    paragraph holding the subscriber flag from ``articleAbonnes``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if the page has no ``<article `` tag, or neither the
            paywall widget nor ``</article>`` is present.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    # The original first called bare articleImage()/articleTitle(), names this
    # module never defines, and used articleStrDescription without assigning
    # it; all three values now come from newsParser.
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleStrType = articleAbonnes(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    pageContent += "<h2>"+articleStrTitle+"</h2>\n"
    pageContent += "<img src=\""+articleStrImageUrl+"\">\n"

    articleCstBegin = "<article "
    articleCstEnd = "<div id=\"poool-widget\">"
    articleCstEnd2 = "</article>"
    indexBegin = content.index(articleCstBegin)
    # Prefer the paywall widget as the end of the body, else the closing tag.
    indexEnd = content.find(articleCstEnd)
    if indexEnd == -1:
        indexEnd = content.index(articleCstEnd2)
    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)

    # Host fixed from "wwww." (four w's), which produced dead links.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.liberation.fr/', article_only)
    pageContent += article_only
    pageContent += "<p>"+articleStrType+"</p>"
    return pageContent
... | ... |
@@ -0,0 +1,38 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download the AMP variant of *url* and return a simplified rendition.

    ``dna.fr/`` in the URL is rewritten to ``dna.fr/amp/`` before the request.
    The result is a set of Open Graph ``<meta>`` tags followed by the
    access-controlled section of the page wrapped in ``<article>``, with
    headings demoted one level and root-relative links pointed at ``dna.fr``.

    Args:
        url: address of the article.

    Returns:
        str: the assembled markup.

    Raises:
        ValueError: if either section marker is missing from the page.
    """
    say("Article: "+url)
    amp_url = url.replace("dna.fr/","dna.fr/amp/")
    content = requests.get(amp_url, allow_redirects=True).text

    image_url = newsParser.articleImage(content)
    title = newsParser.articleTitle(content)
    description = newsParser.articleDescription(content)

    head = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+title+"\">\n",
        "<meta property=\"og:description\" content=\""+description+"\">\n",
        "<meta property=\"og:url\" content=\""+amp_url+"\">\n",
        "<meta property=\"og:image\" content=\""+image_url+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    start = content.index("<section poool-access-content amp-access=\"access\" amp-access-hide>")
    stop = content.index("<section amp-access=\"NOT error AND NOT access\" id=\"poool\">")
    body = content[start:stop]

    # Applied strictly in this order.
    rewrites = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"href=\"\/", 'href=\"//dna.fr/'),
    )
    for pattern, replacement in rewrites:
        body = re.sub(pattern, replacement, body)

    return head + "<article>" + body + "</article>"
... | ... |
@@ -0,0 +1,43 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a francetvinfo.fr article (AMP version) and return cleaned HTML.

    The ``.html`` suffix of the URL is replaced by ``.amp`` before the request.
    The result is a set of Open Graph ``<meta>`` tags followed by the article
    body wrapped in ``<article>`` tags.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the opening ``<article `` tag, or both possible end
            markers, are absent from the page.
    """
    say("Article: "+url)
    url = url.replace(".html", ".amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article ")
    try:
        # Preferred cut: stop before the social-sharing section.
        indexEnd = content.index('<section class="social-zone">')
    except ValueError:
        # Fallback: stop at the closing article tag.
        indexEnd = content.index("</article")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links and sources at the site; the lookahead leaves
    # protocol-relative URLs ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.francetvinfo.fr/', article_only)
    article_only = re.sub(r'src="/(?!/)', 'src="//www.francetvinfo.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,48 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a frandroid.com article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags followed by a heading
    holding the article title and the article body (not wrapped in
    ``<article>`` tags).

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the start marker, or all three possible end markers,
            are absent from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleStrImageUrl = newsParser.articleImage(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index('<div class="article-content')
    # End markers are tried in order of preference; the closing article tag
    # is the last resort and raises ValueError when missing as well.
    # NOTE(review): the leading space of the first marker is significant.
    for endMarker in (' <p class="title">', '<div class="article-footer'):
        indexEnd = content.find(endMarker)
        if indexEnd != -1:
            break
    else:
        indexEnd = content.index("</article>")

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2. The title heading
    # added above goes through the same demotion.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
        ("><", ">\n<"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links at the site; the lookahead leaves
    # protocol-relative links ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.frandroid.com/', article_only)
    pageContent += article_only
    return pageContent
... | ... |
@@ -0,0 +1,36 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download the "amp" host variant of an article and return cleaned HTML.

    The first ``www`` found in the URL is replaced by ``amp`` before the
    request. The result is a set of Open Graph ``<meta>`` tags followed by the
    article body wrapped in ``<article>`` tags.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article`` or ``</article>`` marker is absent
            from the page.
    """
    say("Article: "+url)
    # Replace only the first occurrence so that a "www" appearing later in the
    # path or query string is left intact.
    url = url.replace("www", "amp", 1)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article")
    indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,46 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article with browser-like headers and return cleaned HTML.

    The result is a set of Open Graph ``<meta>`` tags followed by the article
    body wrapped in ``<article>`` tags, with share links neutralised and
    share bars hidden.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article`` marker, or both possible end markers,
            are absent from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article")
    try:
        # Preferred cut: stop before the related-entries block.
        indexEnd = content.index('<div class="related-entries')
    except ValueError:
        indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Neutralise share links and hide share bars; applied in this exact order
    # because the later patterns overlap with the output of the earlier ones.
    for pattern, replacement in (
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
        (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    ):
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,56 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
+ |
|
7 |
def articleAbonnes(content):
    """Tell whether the page is flagged as subscriber-only.

    Args:
        content: HTML text of the page.

    Returns:
        ``"Abonnés"`` when the text ``ABONNÉS`` occurs in ``content``,
        otherwise an empty string.
    """
    # A plain membership test replaces the former index() call guarded by a
    # bare except.
    return "Abonnés" if "ABONNÉS" in content else ""
|
16 |
+ |
|
17 |
def article(url):
    """Download a lejdd.fr article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags, the article body
    wrapped in ``<article>`` tags, and a trailing ``<p>`` holding the value
    returned by ``articleAbonnes``.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article `` or ``</article>`` marker is absent
            from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrType = articleAbonnes(content)

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article ")
    indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Neutralise the social links, retarget links and drop sharing leftovers.
    for pattern, replacement in (
        (r"<a href=(.*?) id=\"fb_socialPageLink\" class=\"icon-Facebook\">", '<a href="">'),
        (r"<a href=(.*?) id=\"tw_socialPageLink\" class=\"icon-Twitter\">", '<a href="">'),
        (r"target=\"_self\"", 'target="new"'),
        (r"<div class=\"nota col-md-4\">Partager sur :</div>", ''),
        (r"<span class=\"hide\">\"</span>", ''),
    ):
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    # Point root-relative links at the site; the lookahead leaves
    # protocol-relative links ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.lejdd.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    pageContent += "<p>"+articleStrType+"</p>"
    return pageContent
... | ... |
@@ -0,0 +1,49 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a ladepeche.fr article (AMP version) and return cleaned HTML.

    When the URL does not contain ``/amp`` it is rewritten to the
    ``www.ladepeche.fr/amp`` form before the request. The result is a set of
    Open Graph ``<meta>`` tags followed by the article body wrapped in
    ``<article>`` tags.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article `` marker, or all three possible end
            markers, are absent from the page.
    """
    say("Article: "+url)
    if "/amp" not in url:
        say("Trying AMP")
        url = url.replace("www.ladepeche.fr", "www.ladepeche.fr/amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article ")
    # End markers are tried in order of preference; the closing article tag
    # is the last resort and raises ValueError when missing as well.
    for endMarker in (
        '<div class="article-full__footer">',
        '<section subscriptions-section="content-not-granted">',
    ):
        indexEnd = content.find(endMarker)
        if indexEnd != -1:
            break
    else:
        indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
        ("><", ">\n<"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links at the site; the lookahead leaves
    # protocol-relative links ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.ladepeche.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,47 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags followed by the article
    body wrapped in ``<article>`` tags, with share links neutralised and
    share bars hidden.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article`` marker, or both possible end markers,
            are absent from the page.
    """
    say("Article: "+url)
    # NOTE: a variant of this request sending browser-like 'Accept-Encoding'
    # and 'User-Agent' headers was previously disabled here.
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article")
    try:
        # Preferred cut: stop before the related-entries block.
        indexEnd = content.index('<div class="related-entries')
    except ValueError:
        indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Neutralise share links and hide share bars; applied in this exact order
    # because the later patterns overlap with the output of the earlier ones.
    for pattern, replacement in (
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
        (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    ):
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,81 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the og:image URL found in ``content``.

    The URL is the text following ``<meta property="og:image" content="`` up
    to the next ``">``; only values starting with ``http`` are matched.

    Raises:
        ValueError: If the opening marker or the closing ``">`` is missing.
    """
    opening = '<meta property="og:image" content="http'
    start = content.index(opening)
    stop = content.index('">', start)
    # "http" belongs to the marker, so it is put back in front of the value.
    return "http" + content[start + len(opening):stop]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the og:title value found in ``content``.

    The title is the text following ``<meta property="og:title" content="``
    up to the next ``">``.

    Raises:
        ValueError: If the opening marker or the closing ``">`` is missing.
    """
    opening = '<meta property="og:title" content="'
    start = content.index(opening)
    stop = content.index('">', start)
    return content[start + len(opening):stop]
|
21 |
+ |
|
22 |
def articleAbonnes(content):
    """Tell whether the page is flagged as subscriber-only.

    Args:
        content: HTML text of the page.

    Returns:
        ``"Abonnés"`` when ``article__content--restricted`` occurs in
        ``content``, otherwise an empty string.
    """
    # A plain membership test replaces the former index() call guarded by a
    # bare except.
    return "Abonnés" if "article__content--restricted" in content else ""
|
32 |
+ |
|
33 |
def article(url):
    """Download a lemonde.fr article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags, then an ``<article>``
    element holding the title, the lead image and the article body, and a
    trailing ``<p>`` holding the value returned by ``articleAbonnes``.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If neither start marker, or the ``</article>`` end
            marker, is present in the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    try:
        # Preferred start: the content section of the article.
        indexBegin = content.index('<section class="article__content')
    except ValueError:
        indexBegin = content.index("<article ")
    indexEnd = content.index("</article>")

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleStrType = articleAbonnes(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += '<img src="'+articleStrImageUrl+'">\n'
    article_only += content[indexBegin:indexEnd]
    article_only = article_only.replace("<amp-img", "<img")
    article_only = article_only.replace("</amp-img>", "")

    # Drop media figures that fit on a single line ('.' does not cross
    # newlines here).
    article_only = re.sub(r"<figure class=\"article__media\">(.*?)</figure>", '', article_only)
    # Replace lazy-loaded images (data: placeholder + data-srcset) by a plain
    # <img> using the 1x candidate. The replacement is a raw string so that
    # "\g<2>" is a group reference and not an invalid string escape.
    article_only = re.sub(
        r"<img src=\"data(.*?)\" data-srcset=\" (.*?) 1x,(.*?)\"(.*?)>",
        r'<img src="\g<2>">',
        article_only,
    )

    # Order matters: h2 must become h3 before h1 becomes h2. The title heading
    # added above goes through the same demotion.
    for old, new in (
        ("</p>", "</p>\n"),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
        ("><", ">\n<"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links at the site; the lookahead leaves
    # protocol-relative links ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.lemonde.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    pageContent += "<p>"+articleStrType+"</p>"
    return pageContent
... | ... |
@@ -0,0 +1,41 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a leparisien.fr article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags followed by the article
    body wrapped in ``<article>`` tags.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article `` start marker or the
            ``<div class="article-spacing">`` end marker is absent.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article ")
    # The body is cut at the spacing block rather than at "</article>".
    indexEnd = content.index('<div class="article-spacing">')
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links and sources at the site; the lookahead leaves
    # protocol-relative URLs ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.leparisien.fr/', article_only)
    article_only = re.sub(r'src="/(?!/)', 'src="//www.leparisien.fr/', article_only)
    article_only = article_only.replace("><", ">\n<")
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,66 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the og:image value found in ``content``.

    The value is the text following ``<meta property="og:image" content="``
    up to the next ``"/>``.

    Raises:
        ValueError: If the opening marker or the closing ``"/>`` is missing.
    """
    opening = '<meta property="og:image" content="'
    start = content.index(opening)
    stop = content.index('"/>', start)
    return content[start + len(opening):stop]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the og:title value found in ``content``.

    The title is the text following ``<meta property="og:title" content="``
    up to the next ``"/>``.

    Raises:
        ValueError: If the opening marker or the closing ``"/>`` is missing.
    """
    opening = '<meta property="og:title" content="'
    start = content.index(opening)
    stop = content.index('"/>', start)
    return content[start + len(opening):stop]
|
21 |
+ |
|
22 |
def articleAbonnes(content):
    """Tell whether the page is flagged as subscriber-only.

    Args:
        content: HTML text of the page.

    Returns:
        ``"Abonnés"`` when the text ``réservé aux abonnés`` occurs in
        ``content``, otherwise an empty string.
    """
    # A plain membership test replaces the former index() call guarded by a
    # bare except.
    return "Abonnés" if "réservé aux abonnés" in content else ""
|
31 |
+ |
|
32 |
def article(url):
    """Download a liberation.fr article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags, a heading with the
    title, the lead image, the article body (not wrapped in extra
    ``<article>`` tags) and a trailing ``<p>`` holding the value returned by
    ``articleAbonnes``.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article `` or ``</article>`` marker is absent
            from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    articleStrType = articleAbonnes(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'
    pageContent += "<h2>"+articleStrTitle+"</h2>\n"
    pageContent += '<img src="'+articleStrImageUrl+'">\n'

    indexBegin = content.index("<article ")
    indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links at the site (host was previously misspelled
    # "wwww"); the lookahead leaves protocol-relative links untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.liberation.fr/', article_only)
    pageContent += article_only
    pageContent += "<p>"+articleStrType+"</p>"
    return pageContent
... | ... |
@@ -0,0 +1,48 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a midilibre.fr article (AMP version) and return cleaned HTML.

    The URL is rewritten to the ``www.midilibre.fr/amp`` form before the
    request. The result is a set of Open Graph ``<meta>`` tags followed by the
    article body wrapped in ``<article>`` tags.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<article `` marker, or all three possible end
            markers, are absent from the page.
    """
    say("Article: "+url)
    # Only rewrite once: an URL that already targets the AMP page would
    # otherwise become ".../amp/amp/...".
    if "www.midilibre.fr/amp" not in url:
        url = url.replace("www.midilibre.fr", "www.midilibre.fr/amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index("<article ")
    # End markers are tried in order of preference; the closing article tag
    # is the last resort and raises ValueError when missing as well.
    for endMarker in (
        '<div class="article-full__footer">',
        '<section subscriptions-section="content-not-granted">',
    ):
        indexEnd = content.find(endMarker)
        if indexEnd != -1:
            break
    else:
        indexEnd = content.index("</article>")
    article_only = content[indexBegin:indexEnd]

    # Order matters: h2 must become h3 before h1 becomes h2.
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
        ("><", ">\n<"),
    ):
        article_only = article_only.replace(old, new)

    # Point root-relative links at the site; the lookahead leaves
    # protocol-relative links ("//host/...") untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//www.midilibre.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,57 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download a mothership.sg article and return a cleaned HTML fragment.

    The result is a set of Open Graph ``<meta>`` tags followed by an
    ``<article>`` element holding the title, the lead image and the article
    body, with advertising and sharing widgets removed or hidden.

    Args:
        url: Address of the article.

    Returns:
        The generated HTML fragment as a string.

    Raises:
        ValueError: If the ``<div class="main-item" `` start marker or the
            ``<div class="social-share bottom">`` end marker is absent.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += '<meta property="og:type" content="article">\n'
    pageContent += '<meta property="og:title" content="'+articleStrTitle+'">\n'
    pageContent += '<meta property="og:description" content="'+articleStrDescription+'">\n'
    pageContent += '<meta property="og:url" content="'+url+'">\n'
    pageContent += '<meta property="og:image" content="'+articleStrImageUrl+'">\n'
    pageContent += '<meta property="og:image:type" content="image/jpeg">'

    indexBegin = content.index('<div class="main-item" ')
    # The end marker is searched after the start marker only.
    indexEnd = content.index('<div class="social-share bottom">', indexBegin)

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += '<img src="'+articleStrImageUrl+'">\n'
    article_only += content[indexBegin:indexEnd]
    article_only = article_only.replace("<amp-img", "<img")
    article_only = article_only.replace("</amp-img>", "")

    # Order matters inside a pass: h2 must become h3 before h1 becomes h2.
    headingShift = (
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    )
    for old, new in headingShift:
        article_only = article_only.replace(old, new)

    # Repeat the lead image in front of the author link.
    newImage = '<img src="'+articleStrImageUrl+'"><div class="article__author-link">'
    # Remove advertising/sharing leftovers and hide the top sharing bar.
    for pattern, replacement in (
        (r"<div class=\"article__author-link\">", newImage),
        (r"<span class=\"advertisement__title\">Advertisement</span>", ''),
        (r"class=\"picture__image lazyload\"", ''),
        (r"<a class=\"addthis_button(.*)</a>", ''),
        (r"<div class=\"c-sharing--default is-article-top-position\"", '<div class="c-sharing--default is-article-top-position" style="display:none"'),
        (r"<h3 class=\"save-for-later__title\">Bookmark</h3>", ''),
    ):
        article_only = re.sub(pattern, replacement, article_only)

    # NOTE(review): the heading demotion is applied a second time, as in the
    # previous implementation, so original h1 and h2 both end up as h3 —
    # confirm this is intended.
    for old, new in headingShift:
        article_only = article_only.replace(old, new)
    article_only = article_only.replace("><", ">\n<")

    # Point root-relative links and sources at the site using a well-formed
    # protocol-relative prefix ("//", not "///"); the lookahead leaves URLs
    # that are already protocol-relative untouched.
    article_only = re.sub(r'href="/(?!/)', 'href="//mothership.sg/', article_only)
    article_only = re.sub(r'src="/(?!/)', 'src="//mothership.sg/', article_only)
    article_only = re.sub(r"src='/(?!/)", "src='//mothership.sg/", article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,58 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import json |
|
5 |
+import newsParser |
|
6 |
+ |
|
7 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The article is read from the entity-encoded JSON document stored in the
    ``:article`` attribute of the page's ``<article-component>`` tag.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the title, the first image, its
        caption (when present) and the cleaned-up article body.

    Raises:
        ValueError: If the ``<article-component>`` markers are missing or the
            embedded JSON cannot be decoded.
        KeyError: If the JSON lacks the title, body or image fields.
        IndexError: If the JSON image list is empty.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text

    articleCstBegin = "<article-component :article=\""
    articleCstEnd = "\" :nid="
    indexBegin = content.index(articleCstBegin) + len(articleCstBegin)
    # Look for the closing marker only after the opening one, so an earlier
    # occurrence in the page cannot produce an empty or inverted slice.
    indexEnd = content.index(articleCstEnd, indexBegin)
    article_json = content[indexBegin:indexEnd]
    # Undo the HTML-attribute encoding (and JSON's escaped slashes) before
    # handing the text to the JSON parser.
    for encoded, decoded in (("&quot;", "\""), ("\\/", "/"), ("&lt;", "<"), ("&gt;", ">")):
        article_json = article_json.replace(encoded, decoded)
    jsonArticle = json.loads(article_json)

    firstImage = jsonArticle['field_article_images'][0]
    articleStrImageUrl = firstImage['url']
    articleStrImageCaption = firstImage['caption']
    articleStrTitle = jsonArticle['title']

    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    if articleStrImageCaption is not None:
        article_only += "<em>"+articleStrImageCaption+"</em>\n"
    article_only += jsonArticle['body']

    # Applied strictly in this order: headings are demoted h2 -> h3 before
    # h1 -> h2 so that a former h1 is not demoted twice.
    substitutions = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
        (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)
    # One tag per line keeps the generated markup readable.
    article_only = article_only.replace("><", ">\n<")

    # Turn site-relative links into protocol-relative links on the site host.
    article_only = re.sub(r"href=\"\/", 'href="//www.straitstimes.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,48 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body is the part of the page that starts at the
    ``<section name="articleBody"`` tag and stops at the next ``</article>``.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the title, description, lead image and
        the cleaned-up article body.

    Raises:
        ValueError: If the begin or end marker is not found in the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<section name=\"articleBody\""
    articleCstEnd = "</article>"
    indexBegin = content.index(articleCstBegin)
    # Anchor the end search at the start of the body so a closing tag that
    # appears earlier in the page cannot yield an empty slice.
    indexEnd = content.index(articleCstEnd, indexBegin)

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]

    # Applied strictly in this order (h2 -> h3 must precede h1 -> h2).
    substitutions = (
        (r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<p>Advertisement</p>", ''),
        # Collapse each <picture> into a plain <img> that uses the first
        # source URL without its query string. The replacement is a raw
        # string so that "\g<2>" is a group reference, not a string escape.
        (r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">'),
        (r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", ''),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    # Turn site-relative links into protocol-relative links on the site host.
    article_only = re.sub(r"href=\"\/", 'href="//www.nytimes.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,56 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body starts at the first ``<article`` tag and stops at the
    ``related-entries`` block, or at ``</article>`` when that block is absent.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the cleaned-up article markup.

    Raises:
        ValueError: If the begin marker, or both end markers, are not found.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<article"
    articleCstEnd = "<div class=\"related-entries"
    articleCstEnd2 = "</article>"
    indexBegin = content.index(articleCstBegin)
    try:
        indexEnd = content.index(articleCstEnd, indexBegin)
    except ValueError:
        # No related-entries block after the article start: stop at the
        # closing tag instead.
        indexEnd = content.index(articleCstEnd2, indexBegin)
    article_only = content[indexBegin:indexEnd]

    # (pattern, replacement, flags), applied strictly in this order.
    # The flags are passed by keyword below: re.sub's fourth positional
    # argument is `count`, so passing re.MULTILINE positionally silently
    # capped the number of replacements instead of setting a flag.
    # NOTE(review): re.MULTILINE only changes ^/$; if the intent was to let
    # "." span line breaks, re.DOTALL is the flag to use - confirm.
    substitutions = (
        (r"<amp-img", '<img', 0),
        (r"</amp-img>", '', 0),
        (r"<h2", '<h3', 0),
        (r"</h2>", '</h3>', 0),
        (r"<h1", '<h2', 0),
        (r"</h1>", '</h2>', 0),
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', 0),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>', 0),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">', 0),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">', 0),
        (r"<div class=\"yr-share\">", '<div style="display:none;">', 0),
        (r"<div>Save this story for later.</div>", '', 0),
        (r"<a class=\"sc-(.*?)byline__name-link button", '<a ', 0),
        (r"<li class=\"social-icons__list-item social-icons__list-item--print social-icons__list-item--standard thinner\">(.*?)</li>", '', re.MULTILINE),
        (r"<li class=\"social-icons__list-item social-icons__list-item--bookmark social-icons__list-item--standard thinner bookmark-disabled\">(.*?)</li>", '', re.MULTILINE),
        (r"<ul class=\"social-icons__list\">(.*?)</ul>", '', re.MULTILINE),
        (r"<aside class=\"sc(.*?)</aside>", '', re.MULTILINE),
        (r"<noscript>(.*?)</noscript>", '', re.MULTILINE),
        (r"<svg class=\"icon icon-print\" width=\"17\" height=\"16\" viewBox=\"0 0 17 16\" fill=\"none\" xmlns=\"http://www.w3.org/2000/svg\">", '', 0),
    )
    for pattern, replacement, flags in substitutions:
        article_only = re.sub(pattern, replacement, article_only, flags=flags)
    article_only = article_only.replace("><", ">\n<")

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,44 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download the AMP version of an article and simplify its markup.

    The ``.html`` suffix of *url* is swapped for ``.amp`` before the request.
    The body starts at the article header and stops at the comments headline,
    or at the comment-react block when that headline is absent.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags (``og:url`` carries the
        rewritten address) followed by an ``<article>`` element holding the
        cleaned-up article markup.

    Raises:
        ValueError: If the begin marker, or both end markers, are not found.
    """
    say("Article: "+url)
    url = url.replace(".html",".amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<header class=\"article__header\">"
    articleCstEnd = "<span class=\"article-comments__headline-title\">"
    articleCstEnd2 = "<div class=\"article-comments__comment-react\">"
    indexBegin = content.index(articleCstBegin)
    try:
        indexEnd = content.index(articleCstEnd, indexBegin)
    except ValueError:
        # Only a missing marker triggers the fallback; any other error
        # propagates instead of being swallowed.
        indexEnd = content.index(articleCstEnd2, indexBegin)

    article_only = content[indexBegin:indexEnd]
    # Applied strictly in this order (h2 -> h3 must precede h1 -> h2).
    substitutions = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    # Turn site-relative links into protocol-relative links on the site host.
    article_only = re.sub(r"href=\"\/", 'href="//www.nouvelobs.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,139 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import json |
|
5 |
+ |
|
6 |
def _first_child_data(node):
    """Return the ``data`` of *node*'s first child, or ``""`` if there is none."""
    try:
        return node["children"][0]["data"]
    except (KeyError, IndexError, TypeError):
        return ""


def article(url):
    """Download an article page and rebuild it from its embedded JSON state.

    The page carries a ``window.__APOLLO_STATE__`` JSON document. The article
    entry is located through the ``ROOT_QUERY`` key that mentions
    ``"applicationId"``, and its summary and body trees are rendered to HTML.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the headline, the summary and the
        rendered body.

    Raises:
        ValueError: If the JSON markers are missing or the JSON is invalid.
        KeyError: If the expected entries are absent from the JSON state.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    cstJsonBegin = "window.__APOLLO_STATE__="
    cstJsonEnd = "</script><script>"
    indexBegin = content.index(cstJsonBegin) + len(cstJsonBegin)
    # Search for the terminator after the state assignment: the same tag
    # sequence may also occur earlier in the page.
    indexEnd = content.index(cstJsonEnd, indexBegin)
    json_only = json.loads(content[indexBegin:indexEnd])

    # NOTE(review): debug dump written to the current working directory on
    # every call - confirm it is still wanted.
    with open('data.json', 'w') as f:
        json.dump(json_only, f)

    contentService = json_only["contentService"]
    rootQuery = contentService["ROOT_QUERY"]
    keyArticle = None
    for key in rootQuery:
        if "\"applicationId\":" in key:
            # When several keys match, the last one wins.
            keyArticle = rootQuery[key]["id"]

    json_article = contentService[keyArticle]
    articleStrTitle = json_article["socialHeadline"]
    articleStrDescription = ""
    for node in json_article["summary"]["json"]:
        htmlType = node["type"]
        htmlContent = node["children"][0]["data"]
        articleStrDescription += "<"+htmlType+">"+htmlContent+"</"+htmlType+">"

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">"
    pageContent += "<article>"
    pageContent += "<h2>"+articleStrTitle+"</h2>\n"
    pageContent += "<em>"+articleStrDescription+"</em>\n"

    # The body lives under a key that embeds its query arguments; when
    # several keys match, the last one wins.
    keyArticle2 = ""
    for key in json_article:
        if "body({\"customContents\"" in key:
            keyArticle2 = key
    json_article2 = json_article[keyArticle2]

    # Element types containing any of these fragments are not rendered.
    skippedTypes = ("ad1", "ad2", "ad3", "ad4", "ad5", "native-ads", "more-on-this")

    cpt = 0
    for element in json_article2["json"]:
        htmlType = element["type"]
        if any(fragment in htmlType for fragment in skippedTypes):
            continue
        try:
            htmlContent = element["children"]
        except KeyError:
            # Elements without children have nothing to render.
            continue
        pageContent += "<"+htmlType+">"

        for elementChildren in htmlContent:
            htmlTypeChildren = elementChildren["type"]
            if "text" == htmlTypeChildren:
                pageContent += elementChildren["data"]
            elif "a" == htmlTypeChildren:
                href = elementChildren["attribs"]["href"]
                pageContent += "<a href=\""+href+"\" target=\"new-"+str(cpt)+"\">"
                pageContent += elementChildren["children"][0]["data"]
                pageContent += "</"+htmlTypeChildren+">"
            elif "img" == htmlTypeChildren:
                src = elementChildren["attribs"]["src"]
                caption = elementChildren["attribs"]["title"]
                pageContent += "<img src=\""+src+"\">"
                pageContent += "<figcaption><em>"+caption+"</em></figcaption>"
                pageContent += _first_child_data(elementChildren)
            elif "iframe" == htmlTypeChildren:
                src = elementChildren["attribs"]["src"]
                caption = elementChildren["attribs"]["title"]
                pageContent += "<iframe src=\""+src+"\">"
                pageContent += _first_child_data(elementChildren)
                pageContent += "</iframe>"
                pageContent += "<figcaption><em><a href=\""+src+"\" target=\"new-"+str(cpt)+"\">"+caption+"</a></em></figcaption>"
            elif "em" == htmlTypeChildren:
                pageContent += "<"+htmlTypeChildren+">"
                pageContent += _first_child_data(elementChildren)
                pageContent += "</"+htmlTypeChildren+">"
            else:
                print("OTHER : "+htmlTypeChildren)
        pageContent += "</"+htmlType+">\n"
        cpt += 1
    pageContent += "</article>"
    return pageContent
... | ... |
@@ -0,0 +1,46 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body is the part of the page between the ``articleBody`` field
    container and the first embedded related-story wrapper.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the title, lead image and the
        cleaned-up article body.

    Raises:
        ValueError: If either marker is not found in the page.
    """
    say("Article: "+url)
    browser_headers = {
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    }
    content = requests.get(url, allow_redirects=True, headers=browser_headers).text

    # Locate the body first, as before, so a missing marker fails early.
    body_start = content.index("<div class=\"odd field-item\" itemprop=\"articleBody\"")
    body_stop = content.index("<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">")

    image_url = newsParser.articleImage(content)
    title = newsParser.articleTitle(content)
    description = newsParser.articleDescription(content)

    head = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\"", title, "\">\n",
        "<meta property=\"og:description\" content=\"", description, "\">\n",
        "<meta property=\"og:url\" content=\"", url, "\">\n",
        "<meta property=\"og:image\" content=\"", image_url, "\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    body = "".join([
        "<h2>", title, "</h2>\n",
        "<img src=\"", image_url, "\">\n",
        content[body_start:body_stop],
    ])

    # Applied strictly in this order (h2 -> h3 must precede h1 -> h2).
    rewrites = [
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
        (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    ]
    for regex, substitute in rewrites:
        body = re.sub(regex, substitute, body)
    body = body.replace("><", ">\n<")

    # Turn site-relative links into protocol-relative links on the site host.
    body = re.sub(r"href=\"\/", 'href=\"//www.straitstimes.com/', body)
    return head + "<article>" + body + "</article>"
... | ... |
@@ -0,0 +1,64 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body starts at the first ``<article `` tag and stops at the first of
    these markers that exists: the article footer, the
    ``content-not-granted`` subscription section, or ``</article>``.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags, the title and lead
        image, then an ``<article>`` element holding the cleaned-up markup.

    Raises:
        ValueError: If the begin marker, or all three end markers, are not
            found.
    """
    say("Article: "+url)
    # NOTE(review): this rewrite targets www.midilibre.fr although links are
    # later pointed at www.sudouest.fr - looks like a copy/paste leftover;
    # kept unchanged pending confirmation.
    url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    pageContent += "<h2>"+articleStrTitle+"</h2>\n"
    pageContent += "<img src=\""+articleStrImageUrl+"\">\n"

    articleCstBegin = "<article "
    articleCstEnd = "<div class=\"article-full__footer\">"
    articleCstEnd2 = "<section subscriptions-section=\"content-not-granted\">"
    articleCstEnd3 = "</article>"
    indexBegin = content.index(articleCstBegin)
    # Try the preferred end markers in order; only the last one is mandatory.
    for marker in (articleCstEnd, articleCstEnd2):
        indexEnd = content.find(marker, indexBegin)
        if indexEnd != -1:
            break
    else:
        indexEnd = content.index(articleCstEnd3, indexBegin)
    article_only = content[indexBegin:indexEnd]

    # (pattern, replacement, flags), applied strictly in this order. Several
    # patterns deliberately consume a tag's closing ">" (the onclick ones)
    # while the "<button" replacement supplies one, so neither the order nor
    # the literals may be changed independently.
    # The flags are passed by keyword below: re.sub's fourth positional
    # argument is `count`, so passing re.MULTILINE positionally silently
    # capped the number of replacements instead of setting a flag.
    # NOTE(review): re.MULTILINE only changes ^/$; if the intent was to let
    # "." span line breaks, re.DOTALL is the flag to use - confirm.
    substitutions = (
        (r"<amp-img", '<img', 0),
        (r"</amp-img>", '', 0),
        (r"<h2", '<h3', 0),
        (r"</h2>", '</h3>', 0),
        (r"<h1", '<h2', 0),
        (r"</h1>", '</h2>', 0),
        (r"<path class=\"icon-base-fill\"(.*?)</path>", '', re.MULTILINE),
        (r"<path class=\"icon-base-fill\"(.*?)\"/>", '', 0),
        # The argument of window.open starts with an entity-encoded quote.
        (r"onclick=\"window.open\(&quot;https:(.*?)\);\">", '', 0),
        (r"<span class=\"text\">S'abonner</span>", '', 0),
        (r"<div id=\"pub_dfp_inread1\" class=\"pub pub_dfp pub_dfp_inread1 upto-tablet base-margin-bottom pub_with_light_background\"></div>", '', re.MULTILINE),
        (r"href=\"mailto:\?subject=(.*?)\"", '', re.MULTILINE),
        (r"<svg class=\"(.*?)\" viewBox=\"0 0 (.*?) (.*?)\">", '<svg>', 0),
        (r"<svg viewBox=\"0 0 (.*?) (.*?)\">", '<svg>', 0),
        (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline\"", '', 0),
        (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline upto-tablet\"", '', 0),
        (r"<button", '<button style="display:none;">', 0),
        (r"<aside class=\"social-links", '<aside style="display:none;" class="social-links', 0),
        (r"onclick=\"if\(navigator\.share\) (.*?)return false;\" >", '', 0),
    )
    for pattern, replacement, flags in substitutions:
        article_only = re.sub(pattern, replacement, article_only, flags=flags)

    # Turn site-relative links into protocol-relative links on the site host.
    article_only = re.sub(r"href=\"\/", 'href="//www.sudouest.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,69 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    The value runs from the end of the opening
    ``<meta property="og:image" content="`` text up to the next ``">``.

    Raises:
        ValueError: If either delimiter is not found in *content*.
    """
    opening = "<meta property=\"og:image\" content=\""
    closing = "\">"
    tag_at = content.index(opening)
    # The closing delimiter is searched from the start of the tag.
    closing_at = content.index(closing, tag_at)
    return content[tag_at + len(opening):closing_at]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    The value runs from the end of the opening
    ``<meta property="og:title" content="`` text up to the next ``">``.

    Raises:
        ValueError: If either delimiter is not found in *content*.
    """
    opening = "<meta property=\"og:title\" content=\""
    closing = "\">"
    tag_at = content.index(opening)
    # The closing delimiter is searched from the start of the tag.
    closing_at = content.index(closing, tag_at)
    return content[tag_at + len(opening):closing_at]
|
21 |
+ |
|
22 |
def articleAbonnes(content):
    """Tell whether the page is flagged as subscriber-only.

    Args:
        content: HTML text of the page.

    Returns:
        ``"Abonnés"`` when *content* contains the text
        ``réservé aux abonnés``, otherwise an empty string.
    """
    # A plain membership test replaces the former index()/bare-except pair,
    # which also swallowed unrelated errors such as KeyboardInterrupt.
    if "réservé aux abonnés" in content:
        return "Abonnés"
    return ""
|
31 |
+ |
|
32 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body starts at the first ``<article `` tag. It stops at
    ``</article>`` for a regular article, or at the subscriber-only notice
    when the page is flagged as reserved for subscribers.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags, the title and lead
        image, an ``<article>`` element holding the cleaned-up markup and a
        final ``<p>`` carrying the article type (empty for regular articles).

    Raises:
        ValueError: If a begin or end marker is not found in the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrType = articleAbonnes(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    pageContent += "<h2>"+articleStrTitle+"</h2>\n"
    pageContent += "<img src=\""+articleStrImageUrl+"\">\n"

    articleCstBegin = "<article "
    # Compare by value: "is" on a string literal tests object identity, which
    # is an implementation detail (and a SyntaxWarning on recent Pythons).
    if articleStrType == "":
        articleCstEnd = "</article>"
    else:
        articleCstEnd = "Cet article est réservé aux abonnés"
    indexBegin = content.index(articleCstBegin)
    # Anchor the end search at the article start so an earlier occurrence of
    # the end marker cannot yield an empty slice.
    indexEnd = content.index(articleCstEnd, indexBegin)
    article_only = content[indexBegin:indexEnd]

    # Applied strictly in this order (h2 -> h3 must precede h1 -> h2).
    substitutions = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)

    # Turn site-relative links into protocol-relative links on the site host
    # (the host previously had a stray fourth "w").
    article_only = re.sub(r"href=\"\/", 'href="//www.telerama.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    pageContent += "<p>"+articleStrType+"</p>"
    return pageContent
... | ... |
@@ -0,0 +1,43 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Download an article page and rebuild it as a simplified HTML fragment.

    The body is the part of the page between the first ``<article `` tag and
    the ``</article>`` that follows it.

    Args:
        url: Address of the article page.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by an
        ``<article>`` element holding the cleaned-up article markup.

    Raises:
        ValueError: If the begin or end marker is not found in the page.
    """
    say("Article: "+url)
    # NOTE(review): this rewrite targets www.midilibre.fr although links are
    # later pointed at www.theatlantic.com - looks like a copy/paste
    # leftover; kept unchanged pending confirmation.
    url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<article "
    articleCstEnd = "</article>"
    indexBegin = content.index(articleCstBegin)
    indexEnd = content.index(articleCstEnd,indexBegin)
    article_only = content[indexBegin:indexEnd]

    # Applied strictly in this order (h2 -> h3 must precede h1 -> h2).
    substitutions = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        # NOTE(review): "gtp-ad" / "</gpt>" look like typos for a gpt-ad
        # element; pattern kept as written pending confirmation.
        (r"<gtp-ad(.*?)></gpt>", ''),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)
    article_only = article_only.replace("><", ">\n<")

    article_only = re.sub(r"<h3 class=\"ArticleRecirc_heading__(.*?)\">Recommended Reading</h3>",'',article_only)
    # The flag is passed by keyword: re.sub's fourth positional argument is
    # `count`, so passing re.MULTILINE positionally silently capped the
    # number of replacements instead of setting a flag.
    article_only = re.sub(r"<ul class=\"ArticleRecirc_list__(.*?)\">", '<ul style="display: none;">', article_only, flags=re.MULTILINE)
    article_only = re.sub(r"<button class=\"ArticleShare_shareButton__(.*?)\" aria-haspopup=\"true\" aria-controls=\"expanded-share-kit\" aria-expanded=\"false\" aria-label=\"Open Share Menu\" data-action=\"click share - expand\">Share</button>", '', article_only)
    # Turn site-relative links into protocol-relative links on the site host.
    article_only = re.sub(r"href=\"\/", 'href="//www.theatlantic.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,74 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:image" content="`` marker and the next
    ``"/>`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:image" content="'
    closing = '"/>'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
14 |
+ |
|
15 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:title" content="`` marker and the next
    ``"/>`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:title" content="'
    closing = '"/>'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
22 |
+ |
|
23 |
def articleDescription(content):
    """Return the value of the page's ``og:description`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:description" content="`` marker and the
    next ``"/>`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:description" content="'
    closing = '"/>'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
30 |
+ |
|
31 |
+ |
|
32 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The returned string starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image) and ends with an ``<article>`` element holding
    the title, description, lead image and the cleaned article body.

    Raises ValueError (from ``str.index``) when one of the expected markers
    is missing from the downloaded page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    # The article body is the slice of the page between these two markers.
    articleCstBegin = "<div class=\"article-body-commercial-selector"
    articleCstEnd = "<div id=\"slot-body-end\">"
    indexBegin = content.index(articleCstBegin)
    indexEnd = content.index(articleCstEnd)
    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    article_only = ""
    article_only += "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]
    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
    # Headings are demoted one level (h2 -> h3 first, then h1 -> h2); this
    # also applies to the title heading prepended above.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
    # Collapse each <picture> into a plain <img> whose src is the srcSet
    # value up to its first '?'. The replacement is a raw string so that
    # ``\g<2>`` is a real group reference instead of an invalid string escape.
    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">', article_only)
    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    # Prefix links that start with "/" with the site host.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,63 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:image" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:image" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:title" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:title" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
21 |
+ |
|
22 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The page is requested with a desktop-browser User-Agent. The returned
    string starts with Open Graph ``<meta>`` tags (type, title, description,
    url, image) and ends with an ``<article>`` element holding the title,
    lead image and the cleaned article body.

    Raises ValueError (from ``str.index``) when an expected marker is missing.
    """
    say("Article: "+url)
    request_headers = {
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    }
    response = requests.get(url, allow_redirects=True, headers=request_headers)
    html = response.text

    # The body runs from the first "<article" up to the pagination comment.
    body_start = html.index("<article")
    body_stop = html.index("<!-- /Pagination -->")

    image_url = articleImage(html)
    title = articleTitle(html)
    description = newsParser.articleDescription(html)

    head = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+title+"\">\n",
        "<meta property=\"og:description\" content=\""+description+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+image_url+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    body = "<h2>"+title+"</h2>\n" + "<img src=\""+image_url+"\">\n" + html[body_start:body_stop]

    # Applied in order: AMP images become plain images, headings drop one
    # level (h2 before h1), and the share widgets are neutralised or hidden.
    rewrites = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<a class=\"share(.*?)\" data-social-name=\"(.*?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">'),
        (r"<li class=\"(.*?) share-bar__item\">", '<li>'),
        (r"<div class=\"share-bar share-bar--sticky yr-share\">", '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
        (r"<div class=\"(.*?) share-bar(.*?)>", '<div style="display:none;">'),
        (r"<div class=\"yr-share\">", '<div style="display:none;">'),
    )
    for pattern, replacement in rewrites:
        body = re.sub(pattern, replacement, body)

    # Put adjacent tags on separate lines, then prefix "/" links with the host.
    body = body.replace("><", ">\n<")
    body = re.sub(r"href=\"\/", 'href=\"//www.straitstimes.com/', body)

    return head + "<article>" + body + "</article>"
... | ... |
@@ -0,0 +1,81 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:image" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:image" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:title" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:title" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
21 |
+ |
|
22 |
def articleDescription(content):
    """Return the value of the page's ``og:description`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:description" content="`` marker and the
    next ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:description" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
29 |
+ |
|
30 |
+ |
|
31 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The page is requested with a desktop-browser User-Agent. The returned
    string starts with Open Graph ``<meta>`` tags (type, title, description,
    url, image) and ends with an ``<article>`` element holding the title,
    description, lead image and the cleaned article body.

    Raises ValueError (from ``str.index``) when none of the known begin or
    end markers is present, or when an Open Graph tag is missing.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text

    # Candidate markers, tried in this order until one is found.
    articleCstBegin = "<div class=\"c-entry-content \">"
    articleCstBegin2 = "<article"
    articleCstEnd = "<div class=\"u-hidden-text\" id=\"formatter-datter\""
    articleCstEnd2 = "<section class=\"c-related-list\">"
    articleCstEnd3 = "</article"

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Only a missing marker (ValueError from str.index) triggers a fallback;
    # any other exception propagates instead of being silently swallowed.
    try:
        indexBegin = content.index(articleCstBegin)
    except ValueError:
        indexBegin = content.index(articleCstBegin2)
    try:
        indexEnd = content.index(articleCstEnd)
    except ValueError:
        try:
            indexEnd = content.index(articleCstEnd2)
        except ValueError:
            indexEnd = content.index(articleCstEnd3)

    article_only = ""
    article_only += "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Headings are demoted one level (h2 -> h3 first, then h1 -> h2); this
    # also applies to the title heading prepended above.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)

    # Prefix links and sources that start with "/" with the site host.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.theverge.com/', article_only)
    article_only = re.sub(r"src=\"\/", 'src=\"//www.theverge.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,67 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:image" content="`` marker and the next
    ``"/>`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:image" content="'
    closing = '"/>'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
13 |
+ |
|
14 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:title" content="`` marker and the next
    ``"/>`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:title" content="'
    closing = '"/>'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
21 |
+ |
|
22 |
+ |
|
23 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The returned string starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image) and ends with an ``<article>`` element holding
    the title, lead image and the cleaned article body.

    Two page layouts are handled: the short-form markers are tried first and
    the long-form ones are used as a fallback.

    Raises ValueError (from ``str.index``) when neither marker of a pair is
    present, or when an Open Graph tag is missing.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleCstBegin = "<div class=\"short-form__body\">"
    articleCstBegin2 = "<div class=\"article__longform__content\">"
    articleCstEnd = "<div class=\"article__tagged\">"
    articleCstEnd2 = "<div class=\"article__longform__tags\">"

    # Only a missing marker (ValueError from str.index) triggers a fallback;
    # any other exception propagates instead of being silently swallowed.
    try:
        indexBegin = content.index(articleCstBegin)
    except ValueError:
        indexBegin = content.index(articleCstBegin2)

    try:
        indexEnd = content.index(articleCstEnd)
    except ValueError:
        indexEnd = content.index(articleCstEnd2)

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    article_only = "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]
    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
    # Headings are demoted one level (h2 -> h3 first, then h1 -> h2); this
    # also applies to the title heading prepended above.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    # Prefix links that start with "/" with the site host.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.vice.com/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,68 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The response encoding is re-detected from the body before decoding. The
    returned string starts with a charset declaration and Open Graph
    ``<meta>`` tags (type, title, description, url, image) and ends with an
    ``<article>`` element holding the title, lead image and the cleaned
    article body.

    Raises ValueError (from ``str.index``) when the ``<article`` opening or
    both end markers are missing from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    r.encoding = r.apparent_encoding
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)
    # When the image goes through the imrs.php resizing proxy, keep only the
    # URL carried in its ``src`` query parameter.
    articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)

    pageContent = ""
    pageContent += "<meta charset=\"utf-8\"/>"
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<article"
    articleCstEnd = "<div class=\"mt-md\">"
    articleCstEnd2 = "</article>"
    indexBegin = content.index(articleCstBegin)
    # Only a missing marker (ValueError from str.index) triggers the
    # fallback; any other exception propagates.
    try:
        indexEnd = content.index(articleCstEnd)
    except ValueError:
        indexEnd = content.index(articleCstEnd2)

    article_only = "<h2>"+articleStrTitle+"</h2>"
    # Append (the original assigned here, which discarded the title heading).
    article_only += "<img src=\""+articleStrImageUrl+"\">"

    article_only += content[indexBegin:indexEnd]

    # Clean-up rules, applied in order. Headings are demoted one level
    # (h2 before h1, which also affects the title heading prepended above);
    # the rest removes ad placeholders and shrinks or strips fixed-size
    # inline styling.
    substitutions = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", ''),
        (r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", ''),
        (r"style=\"width:300px;height:250px\"", 'style="width:1px;height:1px"'),
        (r"style=\"width:120px;height:32px\"", 'style="width:1px;height:1px"'),
        (r"style=\"width:136px;height:20px\"", 'style="width:1px;height:1px"'),
        (r"style=\"width:300px;height:600px\"", 'style="width:1px;height:1px"'),
        (r"style=\"min-height:298px\"", 'style="min-height:1px"'),
        (r"style=\"min-height:600px\"", 'style="min-height:1px"'),
        (r"class=\"center absolute w-100\" style=\"top:-12px\"", ''),
        (r"<div data-qa=\"drop-cap-letter\">", '<div>'),
        (r"filter:blur\(10px\);", ''),
        (r"<div class=\"bg-pattern-1\".+?>", '<div>'),
        (r"<div class=\"bg-pattern-2\".+?>", '<div>'),
        (r"<img class=\"dn canvas-foreground\" src=\".+?\"/>", ''),
        (r"<div class=\"subhead .+?>", '<div>'),
    )
    for pattern, replacement in substitutions:
        article_only = re.sub(pattern, replacement, article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,66 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def articleTitle(content):
    """Return the value of the page's ``og:title`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:title" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:title" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
13 |
+ |
|
14 |
def articleImage(content):
    """Return the value of the page's ``og:image`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:image" content="`` marker and the next
    ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:image" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
21 |
+ |
|
22 |
+ |
|
23 |
def articleDescription(content):
    """Return the value of the page's ``og:description`` meta tag.

    *content* is the raw HTML of the page. The value is everything between
    the opening ``<meta property="og:description" content="`` marker and the
    next ``" />`` terminator.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    opening = '<meta property="og:description" content="'
    closing = '" />'
    tag_at = content.index(opening)
    # The terminator is looked up from the start of the tag itself.
    close_at = content.index(closing, tag_at)
    value_at = tag_at + len(opening)
    return content[value_at:close_at]
|
30 |
+ |
|
31 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The returned string starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image), followed by the title, description and lead
    image, and ends with an ``<article>`` element holding the cleaned body.

    Raises ValueError (from ``str.index``) when an expected marker is missing.
    """
    say("Article: "+url)
    # NOTE(review): this dna.fr AMP rewrite looks like a leftover from another
    # parser (links below are rewritten to news.yahoo.com) - confirm before
    # removing. The rewritten URL is also what ends up in og:url.
    url = url.replace("dna.fr/","dna.fr/amp/")
    response = requests.get(url, allow_redirects=True)
    html = response.text

    title = articleTitle(html)
    image_url = articleImage(html)
    description = articleDescription(html)

    head = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+title+"\">\n",
        "<meta property=\"og:description\" content=\""+description+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+image_url+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
        # Title, description and image are emitted ahead of the <article>.
        "<h2>"+title+"</h2>\n",
        "<em>"+description+"</em>\n",
        "<img src=\""+image_url+"\">\n",
    ])

    # The body runs from the caas-body container to the first "</article>".
    body_start = html.index("<div class=\"caas-body\">")
    body_stop = html.index("</article>")
    body = html[body_start:body_stop]

    # Applied in order: AMP images become plain images, headings drop one
    # level (h2 before h1), and "/" links get the site host.
    rewrites = (
        (r"<amp-img", '<img'),
        (r"</amp-img>", ''),
        (r"<h2", '<h3'),
        (r"</h2>", '</h3>'),
        (r"<h1", '<h2'),
        (r"</h1>", '</h2>'),
        (r"href=\"\/", 'href=\"//news.yahoo.com/'),
    )
    for pattern, replacement in rewrites:
        body = re.sub(pattern, replacement, body)

    return head + "<article>" + body + "</article>"
... | ... |
@@ -0,0 +1,38 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+import newsParser |
|
5 |
+ |
|
6 |
def article(url):
    """Fetch *url* and return a stripped-down HTML rendition of the article.

    The returned string starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image) and ends with an ``<article>`` element holding
    the cleaned contents of the page's own ``<article ...>`` element.

    Raises ValueError (from ``str.index``) when the ``<article `` opening or
    a following ``</article>`` is missing from the page.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<article "
    articleCstEnd = "</article>"
    indexBegin = content.index(articleCstBegin)
    # Search for the closing tag from the opening one onwards, so that a
    # "</article>" occurring earlier in the page cannot produce an empty or
    # truncated slice.
    indexEnd = content.index(articleCstEnd, indexBegin)
    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Headings are demoted one level: h2 -> h3 first, then h1 -> h2.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    # Prefix links that start with "/" with the site host.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.zdnet.fr/', article_only)
    pageContent += "<article>"+article_only+"</article>"
    return pageContent