...
@@ -12,6 +12,7 @@ from .newsParser import newsBondyBlog
from .newsParser import newsBuzzfeedCom
from .newsParser import newsChallengesFr
from .newsParser import newsCNA
+from .newsParser import newsCourrier
from .newsParser import newsDefault
from .newsParser import newsDNA
from .newsParser import newsFranceTVInfo
...
@@ -27,7 +28,6 @@ from .newsParser import newsMediapart
from .newsParser import newsMidiLibre
from .newsParser import newsMothershipSG
from .newsParser import newsNewYorker
-from .newsParser import newsNewYorkTimes
from .newsParser import newsNouvelObs
from .newsParser import newsNSTMy
from .newsParser import newsSCMP
...
@@ -46,6 +46,7 @@ from .newsParser import newsYahooCom
from .newsParser import newsZDNetFr
# ~ from .newsParser import newsXXXXXX
from .newsParser import accountMediapart
+from .newsParser import accountCourrier

def supportedList():
    current_module = __import__(__name__)
...
@@ -213,6 +214,8 @@ def getArticle(url):
        data_page += newsParser.newsSlateFr.article(url)
    elif "mediapart.fr" in url:
        data_page += newsParser.newsMediapart.article(url)
+    elif "courrierinternational.com" in url:
+        data_page += newsParser.newsCourrier.article(url)
    else:
        data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
        #data_page += "<p>Supported News:"
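
For context, getArticle() chooses a site-specific parser by substring-matching the URL, so any courrierinternational.com link now flows through the new module. A minimal sketch of that call path (the dispatcher's module name and the URL are illustrative assumptions, not taken from the diff):

    # hypothetical caller; substitute the module that actually defines getArticle()
    import newsparser_dispatch as dispatcher

    url = "https://www.courrierinternational.com/article/some-story"  # illustrative URL
    page = dispatcher.getArticle(url)  # ends up calling newsParser.newsCourrier.article(url)
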
...
@@ -0,0 +1,98 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+def article(url):
+    say("Article: "+url)
+    pageContent=""
+    article_only=""
+
+    htmlContent=newsParser.accountCourrier.getArticle(url)
+    #htmlContent=""
+    #with open("toto.html") as f:
+    # htmlContent=f.read()
+
+    articleStrTitle = ""
+    articleStrDescription = ""
+    articleStrImageUrl = ""
+    articleStrAuthor = ""
+    try:
+        articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+    except:
+        pass
+    with open("toto.html","w") as f:
+        f.write(htmlContent)
+    print("Title:"+articleStrTitle)
+
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+    articleElementBegin="<article"
+    #articleElementEnd ="<div class=\"news__aside__feedback\">"
+    articleElementEnd ="</article>"
+    print("Start: "+articleElementBegin)
+    print("End : "+articleElementEnd)
+    indexElementBegin = htmlContent.index(articleElementBegin)
+    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)+len(articleElementEnd)
+    article_only = htmlContent[indexElementBegin:indexElementEnd]
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+    #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://www.courrierinternational.com/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://www.courrierinternational.com/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    say("Gain : "+str(lenGain)+"%")
+    #pageContent += "<article>"+article_only+"</article>"
+    pageContent += article_only
+    return pageContent
+
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    #say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
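
The four try/except blocks in newsCourrier.article() all apply the same pattern: pull one Open Graph value out of the raw HTML with a non-greedy regex and fall back to an empty string. A standalone sketch of that step (the helper name and the optional-space tolerance before "/>" are my own, not part of the patch):

    import re

    def og_meta(html, prop, default=""):
        # e.g. og_meta(htmlContent, "title") mirrors the og:title lookup above
        m = re.search(r'<meta property="og:' + re.escape(prop) + r'" content="(.+?)" ?/>', html)
        return m.group(1) if m else default
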
...
@@ -5,9 +5,9 @@ import re
def articleImage(content):
    articleImgBegin ="<meta property=\"og:image\" content=\""
    articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
    try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
        image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
    except:
        image = "favicon.png"
...
@@ -16,21 +16,31 @@ def articleImage(content):
def articleDescription(content):
    articleImgBegin ="<meta property=\"og:description\" content=\""
    articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
    try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
        title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
    except:
        title = "Description Extraction Failed"
    return title

+
+def checkBlock(content):
+    blockString="Attention Required! | Cloudflare"
+    try:
+        indexBlock = content.index(blockString)
+    except:
+        indexBlock = None
+    say("indexBlock: "+str(indexBlock))
+    return indexBlock
+
def articleTitle(content):
    #articleImgBegin ="<meta property=\"og:title\" content=\""
    articleImgBegin ="\"og:title\" content=\""
    articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
    try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
        title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
    except:
        title = "Title Extraction Failed"
...
@@ -41,17 +51,26 @@ def article(url):
    r = requests.get(url, allow_redirects=True)
    content = r.text

-    articleStrImageUrl = articleImage(content)
-    articleStrTitle = articleTitle(content)
-    articleStrDescription = articleDescription(content)
+    with open("test.html","w") as f:
+        f.write(content)
+
+    if checkBlock(content) is not None:
+        articleStrImageUrl = None
+        articleStrTitle = "No Title"
+        articleStrDescription = "No Description"
+    else:
+        articleStrImageUrl = articleImage(content)
+        articleStrTitle = articleTitle(content)
+        articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
+    if articleStrImageUrl is not None:
+        pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
+        pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    articleCstBegin = "<article"
    articleCstEnd = "</article>"
...
@@ -14,9 +14,11 @@ def article(url):
    article_only=""

    htmlContent=newsParser.accountMediapart.getArticle(url)
-    #htmlContent=""
-    #with open("toto.html") as f:
+    #with open("toto.html","w") as f:
+    # f.write(htmlContent)
+    #with open("toto.html","r") as f:
    # htmlContent=f.read()
+    #print(len(htmlContent))

    articleStrTitle = ""
    articleStrDescription = ""
...
@@ -38,8 +40,6 @@ def article(url):
        articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
    except:
        pass
-    #with open("toto.html","w") as f:
-    # f.write(htmlContent)

    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
...
@@ -49,10 +49,19 @@ def article(url):
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
    articleElementBegin="<div class=\"news__heading__top"
-    #articleElementEnd ="<div class=\"news__aside__feedback\">"
    articleElementEnd ="<hr class=\"divider-horizontal"
+    articleElementEnd2 ="<div class=\"news__aside__feedback\">"
+    articleElementEnd3 ="</body>"
    indexElementBegin = htmlContent.index(articleElementBegin)
-    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    try:
+        indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    except:
+        try:
+            indexElementEnd = htmlContent.index(articleElementEnd2,indexElementBegin)
+        except:
+            indexElementEnd = htmlContent.index(articleElementEnd3,indexElementBegin)
+
+    #print(indexElementEnd)
    article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
    lenBefore=len(article_only)
    say("LengthBefore: "+str(lenBefore))
...
@@ -0,0 +1,60 @@
+# ~ from userio import *
+import userio
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+
+
+def article(url):
+    userio.say("Article: "+url)
+    session = HTMLSession()
+    response = session.get(url,timeout=20)
+    pageContent=""
+    article_only=""
+    with response as r:
+        articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
+        articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
+        articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
+        articleStrAuthor = r.html.xpath('//div[@class="author_wrapper"]/@content')
+        article=r.html.find("main")[0]
+        article_only+=article.html
+    lenBefore=len(article_only)
+    userio.say("LengthBefore: "+str(lenBefore))
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    #pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://xxxxx/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://xxxxx/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    userio.say("LengthAfter : "+str(lenAfter))
+    userio.say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
+if __name__ == "__main__":
+    #global server
+    url="https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html"
+    print(url)
+    article(url)
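
Unlike the regex-based parsers, this new module (seemingly a generic template, given the xxxxx placeholder host) reads the Open Graph tags with XPath over the DOM that requests_html builds, and grabs the <main> element for the body. A small standalone illustration of those lookups on an in-memory document (the HTML string is made up for the example):

    from requests_html import HTML

    doc = HTML(html='<html><head><meta property="og:title" content="Example title"/></head>'
                    '<body><main><p>Body</p></main></body></html>')
    print(doc.xpath('//meta[@property="og:title"]/@content')[0])  # -> Example title
    print(doc.find("main")[0].html)                               # markup of the <main> element
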
...
@@ -1,48 +0,0 @@
-from userio import *
-import requests
-import re
-import newsParser
-
-def article(url):
-    say("Article: "+url)
-    r = requests.get(url, allow_redirects=True)
-    content = r.text
-
-    articleStrImageUrl = newsParser.articleImage(content)
-    articleStrTitle = newsParser.articleTitle(content)
-    articleStrDescription = newsParser.articleDescription(content)
-
-    pageContent = ""
-    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
-    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
-    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
-    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
-
-    articleCstBegin = "<section name=\"articleBody\""
-    articleCstEnd = "</article>"
-    indexBegin = content.index(articleCstBegin)
-    indexEnd = content.index(articleCstEnd)
-
-    article_only = ""
-    article_only += "<h2>"+articleStrTitle+"</h2>\n"
-    article_only += "<em>"+articleStrDescription+"</em>\n"
-    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
-    article_only += content[indexBegin:indexEnd]
-    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
-    article_only = re.sub(r"<h2", '<h3', article_only)
-    article_only = re.sub(r"</h2>", '</h3>', article_only)
-    article_only = re.sub(r"<h1", '<h2', article_only)
-    article_only = re.sub(r"</h1>", '</h2>', article_only)
-    article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
-    # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-    # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
-    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
-    # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
-    article_only = article_only.replace("><", ">\n<")
-
-    article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
-    pageContent += "<article>"+article_only+"</article>"
-    return pageContent
...
@@ -8,10 +8,26 @@ def article(url):
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text
    pageContent = ""
-    articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
-    articleCstEnd = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
-    indexBegin = content.index(articleCstBegin)
-    indexEnd = content.index(articleCstEnd)
+    indexBegin=None
+    indexEnd=None
+
+    try:
+        articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
+        articleCstEnd = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
+        indexBegin = content.index(articleCstBegin)
+        indexEnd = content.index(articleCstEnd)
+    except:
+        pass
+
+    if indexBegin is None:
+        try:
+            articleCstBegin = '<div class="layout layout--onecol">'
+            articleCstEnd = '<div class="field-name-body"'
+            indexBegin = content.index(articleCstBegin)
+            indexEnd = content.index(articleCstEnd)
+        except:
+            pass
+

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
...
@@ -87,7 +87,10 @@ def article(url):
    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
    article_only = re.sub(r"><",'>\n<',article_only)

-    #pageContent += "\n"+article_only+"\n"
+    if "<article" in article_only:
+        #pageContent += "\n"+article_only+"\n"
+        article_only = re.sub(r"<article (.+?)>", '', article_only)
+        article_only = re.sub(r"</article>", '', article_only)
    pageContent += "<article>\n"+article_only+"\n</article>\n"
    lenAfter=len(article_only)
    lenGain=0
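
The new branch unwraps any <article ...> element already present in the extracted fragment so that the parser's own <article> wrapper is not nested inside another one. In isolation the two substitutions behave like this (the sample string is illustrative):

    import re

    frag = '<article class="x"><p>Hello</p></article>'
    frag = re.sub(r"<article (.+?)>", '', frag)
    frag = re.sub(r"</article>", '', frag)
    print(frag)  # -> <p>Hello</p>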