@@ -12,6 +12,7 @@ from .newsParser import newsBondyBlog
 from .newsParser import newsBuzzfeedCom
 from .newsParser import newsChallengesFr
 from .newsParser import newsCNA
+from .newsParser import newsCourrier
 from .newsParser import newsDefault
 from .newsParser import newsDNA
 from .newsParser import newsFranceTVInfo
@@ -27,7 +28,6 @@ from .newsParser import newsMediapart
 from .newsParser import newsMidiLibre
 from .newsParser import newsMothershipSG
 from .newsParser import newsNewYorker
-from .newsParser import newsNewYorkTimes
 from .newsParser import newsNouvelObs
 from .newsParser import newsNSTMy
 from .newsParser import newsSCMP
@@ -46,6 +46,7 @@ from .newsParser import newsYahooCom
 from .newsParser import newsZDNetFr
 # ~ from .newsParser import newsXXXXXX
 from .newsParser import accountMediapart
+from .newsParser import accountCourrier
 
 def supportedList():
     current_module = __import__(__name__)
@@ -213,6 +214,8 @@ def getArticle(url):
         data_page += newsParser.newsSlateFr.article(url)
     elif "mediapart.fr" in url:
         data_page += newsParser.newsMediapart.article(url)
+    elif "courrierinternational.com" in url:
+        data_page += newsParser.newsCourrier.article(url)
     else:
         data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
         #data_page += "<p>Supported News:"
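Reviewer note: the elif chain in getArticle() grows by two lines for every supported site. A minimal sketch of the same dispatch expressed as a substring-to-module mapping, assuming the parser modules are imported exactly as in the file above (the PARSERS table and dispatch() helper are hypothetical, not part of this patch):

```python
# Sketch only: same behaviour as the elif chain, assuming each parser module
# exposes article(url). Names below are illustrative.
PARSERS = {
    "mediapart.fr": newsParser.newsMediapart,
    "courrierinternational.com": newsParser.newsCourrier,
}

def dispatch(url):
    for needle, module in PARSERS.items():
        if needle in url:
            return module.article(url)
    # same fallback string as the else branch above
    return "<p>Generic Extraction, click to open <a href=\"" + url + "\" target=\"new\">original link</a></p>\n"
```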
@@ -0,0 +1,98 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+def article(url):
+    say("Article: "+url)
+    pageContent=""
+    article_only=""
+
+    htmlContent=newsParser.accountCourrier.getArticle(url)
+    #htmlContent=""
+    #with open("toto.html") as f:
+    #    htmlContent=f.read()
+
+    articleStrTitle = ""
+    articleStrDescription = ""
+    articleStrImageUrl = ""
+    articleStrAuthor = ""
+    try:
+        articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+    except:
+        pass
+    with open("toto.html","w") as f:
+        f.write(htmlContent)
+    print("Title:"+articleStrTitle)
+
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+    articleElementBegin="<article"
+    #articleElementEnd ="<div class=\"news__aside__feedback\">"
+    articleElementEnd ="</article>"
+    print("Start: "+articleElementBegin)
+    print("End : "+articleElementEnd)
+    indexElementBegin = htmlContent.index(articleElementBegin)
+    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)+len(articleElementEnd)
+    article_only = htmlContent[indexElementBegin:indexElementEnd]
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+    #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://www.courrierinternational.com/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://www.courrierinternational.com/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    say("Gain : "+str(lenGain)+"%")
+    #pageContent += "<article>"+article_only+"</article>"
+    pageContent += article_only
+    return pageContent
+
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    #say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
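Reviewer note: the new module imports BeautifulSoup but reads the og: metadata with re.search. A sketch of the same lookups done through bs4 (stock API, behaviour unchanged when a tag is missing; the og_meta helper name is hypothetical):

```python
from bs4 import BeautifulSoup

def og_meta(htmlContent, prop, default=""):
    # Sketch: read <meta property="og:..."> with bs4 instead of a regex.
    soup = BeautifulSoup(htmlContent, "html.parser")
    tag = soup.find("meta", attrs={"property": prop})
    return tag["content"] if tag and tag.has_attr("content") else default

# articleStrTitle = og_meta(htmlContent, "og:title")
# articleStrImageUrl = og_meta(htmlContent, "og:image")
```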
@@ -5,9 +5,9 @@ import re
 def articleImage(content):
     articleImgBegin ="<meta property=\"og:image\" content=\""
     articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
         image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
     except:
         image = "favicon.png"
@@ -16,21 +16,31 @@ def articleImage(content):
 def articleDescription(content):
     articleImgBegin ="<meta property=\"og:description\" content=\""
     articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
         title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
     except:
         title = "Description Extraction Failed"
     return title
 
+
+def checkBlock(content):
+    blockString="Attention Required! | Cloudflare"
+    try:
+        indexBlock = content.index(blockString)
+    except:
+        indexBlock = None
+    say("indexBlock: "+str(indexBlock))
+    return indexBlock
+
 def articleTitle(content):
     #articleImgBegin ="<meta property=\"og:title\" content=\""
     articleImgBegin ="\"og:title\" content=\""
     articleImgEnd ="\""
-    indexImgBegin = content.index(articleImgBegin)
-    indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     try:
+        indexImgBegin = content.index(articleImgBegin)
+        indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
         title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
     except:
         title = "Title Extraction Failed"
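Reviewer note: checkBlock() returns the offset of the Cloudflare challenge title when the fetch was intercepted and None otherwise. A quick sketch of how that reads in practice, using made-up page snippets:

```python
# Sketch: checkBlock() is None for a normal page, an integer offset for a
# Cloudflare challenge page. Page strings below are illustrative.
blocked_page = "<title>Attention Required! | Cloudflare</title>"
normal_page = "<title>Some article</title>"

assert checkBlock(blocked_page) is not None   # challenge detected
assert checkBlock(normal_page) is None        # nothing found, extraction can proceed
```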
@@ -41,17 +51,26 @@ def article(url):
     r = requests.get(url, allow_redirects=True)
     content = r.text
 
-    articleStrImageUrl = articleImage(content)
-    articleStrTitle = articleTitle(content)
-    articleStrDescription = articleDescription(content)
+    with open("test.html","w") as f:
+        f.write(content)
+
+    if checkBlock(content) != None:
+        articleStrImageUrl = None
+        articleStrTitle = "No Title"
+        articleStrDescription = "No Description"
+    else:
+        articleStrImageUrl = articleImage(content)
+        articleStrTitle = articleTitle(content)
+        articleStrDescription = articleDescription(content)
 
     pageContent = ""
     pageContent += "<meta property=\"og:type\" content=\"article\">\n"
     pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
     pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
     pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
+    if None is not articleStrImageUrl:
+        pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
+        pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
 
     articleCstBegin = "<article"
     articleCstEnd = "</article>"
@@ -14,9 +14,11 @@ def article(url):
     article_only=""
 
     htmlContent=newsParser.accountMediapart.getArticle(url)
-    #htmlContent=""
-    #with open("toto.html") as f:
+    #with open("toto.html","w") as f:
+    #    f.write(htmlContent)
+    #with open("toto.html","r") as f:
     #    htmlContent=f.read()
+    #print(len(htmlContent))
 
     articleStrTitle = ""
     articleStrDescription = ""
@@ -38,8 +40,6 @@ def article(url):
         articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
     except:
         pass
-    #with open("toto.html","w") as f:
-    #    f.write(htmlContent)
 
     pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
     pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
@@ -49,10 +49,19 @@ def article(url):
     pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
     pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
     articleElementBegin="<div class=\"news__heading__top"
-    #articleElementEnd ="<div class=\"news__aside__feedback\">"
     articleElementEnd ="<hr class=\"divider-horizontal"
+    articleElementEnd2 ="<div class=\"news__aside__feedback\">"
+    articleElementEnd3 ="</body>"
     indexElementBegin = htmlContent.index(articleElementBegin)
-    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    try:
+        indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    except:
+        try:
+            indexElementEnd = htmlContent.index(articleElementEnd2,indexElementBegin)
+        except:
+            indexElementEnd = htmlContent.index(articleElementEnd3,indexElementBegin)
+
+    #print(indexElementEnd)
     article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
     lenBefore=len(article_only)
     say("LengthBefore: "+str(lenBefore))
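Reviewer note: the nested try/except walks through three possible end markers, with </body> as the last resort. The same fallback can be sketched as a loop over candidates, which makes adding a fourth marker a one-line change (find_article_end is a hypothetical helper, semantics assumed identical):

```python
# Sketch: first end marker found after indexElementBegin wins; </body> acts as
# the last resort, mirroring the nested try/except above.
def find_article_end(htmlContent, indexElementBegin):
    candidates = ("<hr class=\"divider-horizontal",
                  "<div class=\"news__aside__feedback\">",
                  "</body>")
    for marker in candidates:
        try:
            return htmlContent.index(marker, indexElementBegin)
        except ValueError:
            continue
    raise ValueError("no end marker found")
```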
@@ -0,0 +1,60 @@
+# ~ from userio import *
+import userio
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+
+
+def article(url):
+    userio.say("Article: "+url)
+    session = HTMLSession()
+    response = session.get(url,timeout=20)
+    pageContent=""
+    article_only=""
+    with response as r:
+        articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
+        articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
+        articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
+        articleStrAuthor = r.html.xpath('//div[@class="author_wrapper"]/@content')
+        article=r.html.find("main")[0]
+        article_only+=article.html
+    lenBefore=len(article_only)
+    userio.say("LengthBefore: "+str(lenBefore))
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    #pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://xxxxx/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://xxxxx/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    userio.say("LengthAfter : "+str(lenAfter))
+    userio.say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
+if __name__ == "__main__":
+    #global server
+    url="https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html"
+    print(url)
+    article(url)
@@ -1,48 +0,0 @@
-from userio import *
-import requests
-import re
-import newsParser
-
-def article(url):
-    say("Article: "+url)
-    r = requests.get(url, allow_redirects=True)
-    content = r.text
-
-    articleStrImageUrl = newsParser.articleImage(content)
-    articleStrTitle = newsParser.articleTitle(content)
-    articleStrDescription = newsParser.articleDescription(content)
-
-    pageContent = ""
-    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
-    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
-    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
-    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
-
-    articleCstBegin = "<section name=\"articleBody\""
-    articleCstEnd = "</article>"
-    indexBegin = content.index(articleCstBegin)
-    indexEnd = content.index(articleCstEnd)
-
-    article_only = ""
-    article_only += "<h2>"+articleStrTitle+"</h2>\n"
-    article_only += "<em>"+articleStrDescription+"</em>\n"
-    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
-    article_only += content[indexBegin:indexEnd]
-    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
-    article_only = re.sub(r"<h2", '<h3', article_only)
-    article_only = re.sub(r"</h2>", '</h3>', article_only)
-    article_only = re.sub(r"<h1", '<h2', article_only)
-    article_only = re.sub(r"</h1>", '</h2>', article_only)
-    article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
-    # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-    # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
-    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
-    # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
-    article_only = article_only.replace("><", ">\n<")
-
-    article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
-    pageContent += "<article>"+article_only+"</article>"
-    return pageContent
@@ -8,10 +8,26 @@ def article(url):
     r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
     content = r.text
     pageContent = ""
-    articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
-    articleCstEnd = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
-    indexBegin = content.index(articleCstBegin)
-    indexEnd = content.index(articleCstEnd)
+    indexBegin=None
+    indexEnd=None
+
+    try:
+        articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
+        articleCstEnd = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
+        indexBegin = content.index(articleCstBegin)
+        indexEnd = content.index(articleCstEnd)
+    except:
+        pass
+
+    if None == indexBegin:
+        try:
+            articleCstBegin = '<div class="layout layout--onecol">'
+            articleCstEnd = '<div class="field-name-body"'
+            indexBegin = content.index(articleCstBegin)
+            indexEnd = content.index(articleCstEnd)
+        except:
+            pass
+
 
     articleStrImageUrl = newsParser.articleImage(content)
     articleStrTitle = newsParser.articleTitle(content)
@@ -87,7 +87,10 @@ def article(url):
     article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
     article_only = re.sub(r"><",'>\n<',article_only)
 
-    #pageContent += "\n"+article_only+"\n"
+    if "<article" in article_only:
+        #pageContent += "\n"+article_only+"\n"
+        article_only = re.sub(r"<article (.+?)>", '', article_only)
+        article_only = re.sub(r"</article>", '', article_only)
     pageContent += "<article>\n"+article_only+"\n</article>\n"
     lenAfter=len(article_only)
     lenGain=0
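Reviewer note on the new unwrapping step: the pattern <article (.+?)> only matches an opening tag that carries attributes, so a bare <article> slips through and ends up nested inside the wrapper added on the next line. A slightly wider pattern covers both cases (sketch only, same intent, drop-in for the two re.sub calls above):

```python
# Sketch: strip any opening <article ...> tag, with or without attributes,
# before re-wrapping the cleaned content in a single <article> element.
article_only = re.sub(r"<article[^>]*>", '', article_only)
article_only = re.sub(r"</article>", '', article_only)
```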