Showing 8 changed files with 231 additions and 71 deletions
newsParser/__init__.py  (+4 -1)
@@ -12,6 +12,7 @@ from .newsParser import newsBondyBlog
 from .newsParser import newsBuzzfeedCom
 from .newsParser import newsChallengesFr
 from .newsParser import newsCNA
+from .newsParser import newsCourrier
 from .newsParser import newsDefault
 from .newsParser import newsDNA
 from .newsParser import newsFranceTVInfo
@@ -27,7 +28,6 @@ from .newsParser import newsMediapart
 from .newsParser import newsMidiLibre
 from .newsParser import newsMothershipSG
 from .newsParser import newsNewYorker
-from .newsParser import newsNewYorkTimes
 from .newsParser import newsNouvelObs
 from .newsParser import newsNSTMy
 from .newsParser import newsSCMP
@@ -46,6 +46,7 @@ from .newsParser import newsYahooCom
 from .newsParser import newsZDNetFr
 # ~ from .newsParser import newsXXXXXX
 from .newsParser import accountMediapart
+from .newsParser import accountCourrier
 
 def supportedList():
   current_module = __import__(__name__)
@@ -213,6 +214,8 @@ def getArticle(url):
       data_page += newsParser.newsSlateFr.article(url)
     elif "mediapart.fr" in url:
       data_page += newsParser.newsMediapart.article(url)
+    elif "courrierinternational.com" in url:
+      data_page += newsParser.newsCourrier.article(url)
     else:
       data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
       #data_page += "<p>Supported News:"
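For reference, a minimal usage sketch of the new dispatch branch, assuming the top-level newsParser package is importable and that getArticle returns the assembled HTML fragment (the article URL below is purely illustrative):

import newsParser

# Any courrierinternational.com URL is now routed to newsParser.newsCourrier.article()
# instead of falling through to the generic extraction branch.
url = "https://www.courrierinternational.com/article/example"  # illustrative URL
fragment = newsParser.getArticle(url)
print(fragment[:200])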
newsParser/newsParser/newsCourrier.py  (new file, +98)
@@ -0,0 +1,98 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+def article(url):
+  say("Article: "+url)
+  pageContent=""
+  article_only=""
+
+  htmlContent=newsParser.accountCourrier.getArticle(url)
+  #htmlContent=""
+  #with open("toto.html") as f:
+  #  htmlContent=f.read()
+
+  articleStrTitle = ""
+  articleStrDescription = ""
+  articleStrImageUrl = ""
+  articleStrAuthor = ""
+  try:
+    articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+  except:
+    pass
+  with open("toto.html","w") as f:
+    f.write(htmlContent)
+  print("Title:"+articleStrTitle)
+
+  pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+  pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+  pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+  articleElementBegin="<article"
+  #articleElementEnd  ="<div class=\"news__aside__feedback\">"
+  articleElementEnd  ="</article>"
+  print("Start: "+articleElementBegin)
+  print("End  : "+articleElementEnd)
+  indexElementBegin  = htmlContent.index(articleElementBegin)
+  indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)+len(articleElementEnd)
+  article_only = htmlContent[indexElementBegin:indexElementEnd]
+  lenBefore=len(article_only)
+  say("LengthBefore: "+str(lenBefore))
+
+  article_only = re.sub(r"<amp-img", '<img', article_only)
+  article_only = re.sub(r"</amp-img>", '', article_only)
+  article_only = re.sub(r"<h2", '<h3', article_only)
+  article_only = re.sub(r"</h2>", '</h3>', article_only)
+  article_only = re.sub(r"<h1", '<h2', article_only)
+  article_only = re.sub(r"</h1>", '</h2>', article_only)
+  article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<script(.+?)/>','',article_only)
+  article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+  #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+  #article_only = re.sub(r'','',article_only)
+  article_only = re.sub(r"href=\"/",'href="https://www.courrierinternational.com/',article_only)
+  article_only = re.sub(r"src=\"/",'src="https://www.courrierinternational.com/',article_only)
+  article_only = re.sub(r"^$",'',article_only)
+  article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+  article_only = re.sub(r"><",'>\n<',article_only)
+
+  lenAfter=len(article_only)
+  lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  say("Gain        : "+str(lenGain)+"%")
+  #pageContent += "<article>"+article_only+"</article>"
+  pageContent += article_only
+  return pageContent
+
+
+  #pageContent += "\n"+article_only+"\n"
+  pageContent += "<article>\n"+article_only+"\n</article>\n"
+  lenAfter=len(article_only)
+  #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  #say("Gain        : "+str(lenGain)+"%")
+  return pageContent
+
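The extraction above slices the raw HTML between the first "<article" and the following "</article>" with str.index, then cleans the fragment with re.sub. A self-contained sketch of that slicing step (function name and sample HTML are illustrative, not part of the module):

import re

def slice_between(html, begin_marker, end_marker):
  # Locate the first begin marker, then the first end marker after it,
  # and return the enclosed fragment, end marker included.
  start = html.index(begin_marker)
  end = html.index(end_marker, start) + len(end_marker)
  return html[start:end]

sample = '<html><body><article id="a"><p>Bonjour</p></article></body></html>'
fragment = slice_between(sample, "<article", "</article>")
fragment = re.sub(r"><", '>\n<', fragment)  # same readability tweak as the parser
print(fragment)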
newsParser/newsParser/newsDefault.py  (+30 -11)
@@ -5,9 +5,9 @@ import re
 def articleImage(content):
   articleImgBegin ="<meta property=\"og:image\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     image = "favicon.png"
@@ -16,21 +16,31 @@ def articleImage(content):
 def articleDescription(content):
   articleImgBegin ="<meta property=\"og:description\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     title = "Description Extraction Failed"
   return title
 
+
+def checkBlock(content):
+  blockString="Attention Required! | Cloudflare"
+  try:
+    indexBlock = content.index(blockString)
+  except:
+    indexBlock = None
+  say("indexBlock: "+str(indexBlock))
+  return indexBlock
+
 def articleTitle(content):
   #articleImgBegin ="<meta property=\"og:title\" content=\""
   articleImgBegin ="\"og:title\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     title = "Title Extraction Failed"
@@ -41,17 +51,26 @@ def article(url):
   r = requests.get(url, allow_redirects=True)
   content = r.text
 
-  articleStrImageUrl = articleImage(content)
-  articleStrTitle = articleTitle(content)
-  articleStrDescription = articleDescription(content)
+  with open("test.html","w") as f:
+    f.write(content)
+
+  if checkBlock(content) is not None:
+    articleStrImageUrl = None
+    articleStrTitle = "No Title"
+    articleStrDescription = "No Description"
+  else:
+    articleStrImageUrl = articleImage(content)
+    articleStrTitle = articleTitle(content)
+    articleStrDescription = articleDescription(content)
 
   pageContent = ""
   pageContent += "<meta property=\"og:type\" content=\"article\">\n"
   pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
   pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
   pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
+  if articleStrImageUrl is not None:
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
 
   articleCstBegin = "<article"
   articleCstEnd   = "</article>"
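A short sketch of the new Cloudflare detection flow, with the branch written out the way the "No Title"/"No Description" placeholders suggest it is meant to behave (function name and sample strings are illustrative):

def check_block(content):
  # Return the position of the Cloudflare challenge marker, or None when it is absent.
  try:
    return content.index("Attention Required! | Cloudflare")
  except ValueError:
    return None

blocked_page = "<title>Attention Required! | Cloudflare</title>"
normal_page = '<meta property="og:title" content="Some headline">'
for page in (blocked_page, normal_page):
  if check_block(page) is not None:
    print("blocked: fall back to placeholder title/description, no og:image")
  else:
    print("not blocked: extract og: metadata as usual")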
newsParser/newsParser/newsMediapart.py  (+15 -6)
@@ -14,9 +14,11 @@ def article(url):
   article_only=""
 
   htmlContent=newsParser.accountMediapart.getArticle(url)
-  #htmlContent=""
-  #with open("toto.html") as f:
+  #with open("toto.html","w") as f:
+  #  f.write(htmlContent)
+  #with open("toto.html","r") as f:
   #  htmlContent=f.read()
+  #print(len(htmlContent))
 
   articleStrTitle = ""
   articleStrDescription = ""
@@ -38,8 +40,6 @@ def article(url):
     articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
   except:
     pass
-  #with open("toto.html","w") as f:
-  #  f.write(htmlContent)
 
   pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
   pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
@@ -49,10 +49,19 @@ def article(url):
   pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
   pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
   articleElementBegin="<div class=\"news__heading__top"
-  #articleElementEnd  ="<div class=\"news__aside__feedback\">"
   articleElementEnd  ="<hr class=\"divider-horizontal"
+  articleElementEnd2  ="<div class=\"news__aside__feedback\">"
+  articleElementEnd3  ="</body>"
   indexElementBegin  = htmlContent.index(articleElementBegin)
-  indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)
+  try:
+    indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)
+  except:
+    try:
+      indexElementEnd    = htmlContent.index(articleElementEnd2,indexElementBegin)
+    except:
+      indexElementEnd    = htmlContent.index(articleElementEnd3,indexElementBegin)
+
+  #print(indexElementEnd)
   article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
   lenBefore=len(article_only)
   say("LengthBefore: "+str(lenBefore))
newsParser/newsParser/newsNYT.py  (new file, +60)
@@ -0,0 +1,60 @@
+# ~ import userio
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+
+
+def article(url):
+  say("Article: "+url)
+  session = HTMLSession()
+  response = session.get(url,timeout=20)
+  pageContent=""
+  article_only=""
+  with response as r:
+    articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
+    articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
+    articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
+    articleStrAuthor = r.html.xpath('//div[@class="author_wrapper"]/@content')
+    article=r.html.find("main")[0]
+    article_only+=article.html
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    #pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+
+  article_only = re.sub(r"<amp-img", '<img', article_only)
+  article_only = re.sub(r"</amp-img>", '', article_only)
+  article_only = re.sub(r"<h2", '<h3', article_only)
+  article_only = re.sub(r"</h2>", '</h3>', article_only)
+  article_only = re.sub(r"<h1", '<h2', article_only)
+  article_only = re.sub(r"</h1>", '</h2>', article_only)
+  article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<script(.+?)/>','',article_only)
+  #article_only = re.sub(r'','',article_only)
+  article_only = re.sub(r"href=\"/",'href="https://www.nytimes.com/',article_only)  # base host assumed (nytimes.com)
+  article_only = re.sub(r"src=\"/",'src="https://www.nytimes.com/',article_only)
+  article_only = re.sub(r"^$",'',article_only)
+  article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+  article_only = re.sub(r"><",'>\n<',article_only)
+
+  #pageContent += "\n"+article_only+"\n"
+  pageContent += "<article>\n"+article_only+"\n</article>\n"
+  lenAfter=len(article_only)
+  lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  say("Gain        : "+str(lenGain)+"%")
+  return pageContent
+
+if __name__ == "__main__":
+  #global server
+  url="https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html"
+  print(url)
+  article(url)
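newsNYT.py reads the og: metadata and the <main> element through requests_html. A condensed sketch of that lookup, assuming requests-html is installed and the page exposes the usual og: tags (missing tags fall back to empty strings):

from requests_html import HTMLSession

def og_meta(url):
  session = HTMLSession()
  r = session.get(url, timeout=20)
  # Each xpath query returns a list; a missing tag simply yields an empty list.
  title = r.html.xpath('//meta[@property="og:title"]/@content')
  image = r.html.xpath('//meta[@property="og:image"]/@content')
  main = r.html.find("main", first=True)
  return (title[0] if title else "",
          image[0] if image else "",
          main.html if main else "")

if __name__ == "__main__":
  # Same sample article as the module's __main__ block.
  print(og_meta("https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html")[0])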
newsParser/newsParser/newsNewYorkTimes.py  (deleted, -48)
@@ -1,48 +0,0 @@
-from userio import *
-import requests
-import re
-import newsParser
-
-def article(url):
-  say("Article: "+url)
-  r = requests.get(url, allow_redirects=True)
-  content = r.text
-
-  articleStrImageUrl = newsParser.articleImage(content)
-  articleStrTitle = newsParser.articleTitle(content)
-  articleStrDescription = newsParser.articleDescription(content)
-
-  pageContent = ""
-  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
-  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
-  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
-  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
-
-  articleCstBegin = "<section name=\"articleBody\""
-  articleCstEnd   = "</article>"
-  indexBegin = content.index(articleCstBegin)
-  indexEnd   = content.index(articleCstEnd)
-
-  article_only = ""
-  article_only += "<h2>"+articleStrTitle+"</h2>\n"
-  article_only += "<em>"+articleStrDescription+"</em>\n"
-  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
-  article_only += content[indexBegin:indexEnd]
-  article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
-  article_only = re.sub(r"<h2", '<h3', article_only)
-  article_only = re.sub(r"</h2>", '</h3>', article_only)
-  article_only = re.sub(r"<h1", '<h2', article_only)
-  article_only = re.sub(r"</h1>", '</h2>', article_only)
-  article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
-  # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-  # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-  article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
-  article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
-  # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
-  article_only = article_only.replace("><", ">\n<")
-
-  article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
-  pageContent += "<article>"+article_only+"</article>"
-  return pageContent
newsParser/newsParser/newsStraitsTimes.py  (+20 -4)
@@ -8,10 +8,26 @@ def article(url):
   r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
   content = r.text
   pageContent = ""
-  articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
-  articleCstEnd   = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
-  indexBegin = content.index(articleCstBegin)
-  indexEnd   = content.index(articleCstEnd)
+  indexBegin=None
+  indexEnd=None
+
+  try:
+    articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
+    articleCstEnd   = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
+    indexBegin = content.index(articleCstBegin)
+    indexEnd   = content.index(articleCstEnd)
+  except:
+    pass
+
+  if indexBegin is None:
+    try:
+      articleCstBegin = '<div class="layout layout--onecol">'
+      articleCstEnd   = '<div class="field-name-body"'
+      indexBegin = content.index(articleCstBegin)
+      indexEnd   = content.index(articleCstEnd)
+    except:
+      pass
+
 
   articleStrImageUrl = newsParser.articleImage(content)
   articleStrTitle = newsParser.articleTitle(content)
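The change above tries one pair of layout markers and, if the first pair is absent, falls back to a second pair. The same idea written as a loop over candidate (begin, end) pairs (helper name is illustrative; the marker strings are the ones used in the diff):

def locate_body(content, marker_pairs):
  # Return (begin, end) indices for the first marker pair present in the page, else (None, None).
  for begin_marker, end_marker in marker_pairs:
    try:
      return content.index(begin_marker), content.index(end_marker)
    except ValueError:
      continue
  return None, None

pairs = [
  ('<div class="odd field-item" itemprop="articleBody"',
   '<div class="token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q" data-dsnote="mchammer">'),
  ('<div class="layout layout--onecol">', '<div class="field-name-body"'),
]
begin, end = locate_body('<div class="layout layout--onecol"><div class="field-name-body" ...>', pairs)
print(begin, end)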
newsParser/newsParser/newsWaPo.py  (+4 -1)
@@ -87,7 +87,10 @@ def article(url):
   article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
   article_only = re.sub(r"><",'>\n<',article_only)
 
-  #pageContent += "\n"+article_only+"\n"
+  if "<article" in article_only:
+    #pageContent += "\n"+article_only+"\n"
+    article_only = re.sub(r"<article (.+?)>", '', article_only)
+    article_only = re.sub(r"</article>", '', article_only)
   pageContent += "<article>\n"+article_only+"\n</article>\n"
   lenAfter=len(article_only)
   lenGain=0
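The newsWaPo change strips any <article> wrapper already present in the extracted fragment so that the re-wrapping below it does not produce nested <article> elements. A minimal sketch of that normalization (sample HTML is illustrative; the substitutions mirror the diff, so a bare <article> tag without attributes is left untouched):

import re

fragment = '<article class="wp"><p>Body text</p></article>'
if "<article" in fragment:
  # Drop the existing wrapper before re-wrapping the fragment.
  fragment = re.sub(r"<article (.+?)>", '', fragment)
  fragment = re.sub(r"</article>", '', fragment)
print("<article>\n" + fragment + "\n</article>")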