Showing 8 changed files with 231 additions and 71 deletions
newsParser/__init__.py  (+4 -1)
@@ -12,6 +12,7 @@ from .newsParser import newsBondyBlog
 from .newsParser import newsBuzzfeedCom
 from .newsParser import newsChallengesFr
 from .newsParser import newsCNA
+from .newsParser import newsCourrier
 from .newsParser import newsDefault
 from .newsParser import newsDNA
 from .newsParser import newsFranceTVInfo
@@ -27,7 +28,6 @@ from .newsParser import newsMediapart
 from .newsParser import newsMidiLibre
 from .newsParser import newsMothershipSG
 from .newsParser import newsNewYorker
-from .newsParser import newsNewYorkTimes
 from .newsParser import newsNouvelObs
 from .newsParser import newsNSTMy
 from .newsParser import newsSCMP
@@ -46,6 +46,7 @@ from .newsParser import newsYahooCom
 from .newsParser import newsZDNetFr
 # ~ from .newsParser import newsXXXXXX
 from .newsParser import accountMediapart
+from .newsParser import accountCourrier
 
 def supportedList():
   current_module = __import__(__name__)
@@ -213,6 +214,8 @@ def getArticle(url):
       data_page += newsParser.newsSlateFr.article(url)
     elif "mediapart.fr" in url:
       data_page += newsParser.newsMediapart.article(url)
+    elif "courrierinternational.com" in url:
+      data_page += newsParser.newsCourrier.article(url)
     else:
       data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
       #data_page += "<p>Supported News:"
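For reference, a minimal usage sketch of the new dispatch branch, assuming the top-level newsParser package is importable and that getArticle returns the assembled HTML fragment (the article URL below is purely illustrative):

import newsParser

# Any courrierinternational.com URL is now routed to newsParser.newsCourrier.article()
# instead of falling through to the generic extraction branch.
url = "https://www.courrierinternational.com/article/example"  # illustrative URL
fragment = newsParser.getArticle(url)
print(fragment[:200])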
newsParser/newsParser/newsCourrier.py  (new file, +98)
@@ -0,0 +1,98 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+def article(url):
+  say("Article: "+url)
+  pageContent=""
+  article_only=""
+
+  htmlContent=newsParser.accountCourrier.getArticle(url)
+  #htmlContent=""
+  #with open("toto.html") as f:
+  #  htmlContent=f.read()
+
+  articleStrTitle = ""
+  articleStrDescription = ""
+  articleStrImageUrl = ""
+  articleStrAuthor = ""
+  try:
+    articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+  except:
+    pass
+  try:
+    articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+  except:
+    pass
+  with open("toto.html","w") as f:
+    f.write(htmlContent)
+  print("Title:"+articleStrTitle)
+
+  pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+  pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+  pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+  articleElementBegin="<article"
+  #articleElementEnd  ="<div class=\"news__aside__feedback\">"
+  articleElementEnd  ="</article>"
+  print("Start: "+articleElementBegin)
+  print("End  : "+articleElementEnd)
+  indexElementBegin  = htmlContent.index(articleElementBegin)
+  indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)+len(articleElementEnd)
+  article_only = htmlContent[indexElementBegin:indexElementEnd]
+  lenBefore=len(article_only)
+  say("LengthBefore: "+str(lenBefore))
+
+  article_only = re.sub(r"<amp-img", '<img', article_only)
+  article_only = re.sub(r"</amp-img>", '', article_only)
+  article_only = re.sub(r"<h2", '<h3', article_only)
+  article_only = re.sub(r"</h2>", '</h3>', article_only)
+  article_only = re.sub(r"<h1", '<h2', article_only)
+  article_only = re.sub(r"</h1>", '</h2>', article_only)
+  article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<script(.+?)/>','',article_only)
+  article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+  #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+  #article_only = re.sub(r'','',article_only)
+  article_only = re.sub(r"href=\"/",'href="https://www.courrierinternational.com/',article_only)
+  article_only = re.sub(r"src=\"/",'src="https://www.courrierinternational.com/',article_only)
+  article_only = re.sub(r"^$",'',article_only)
+  article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+  article_only = re.sub(r"><",'>\n<',article_only)
+
+  lenAfter=len(article_only)
+  lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  say("Gain        : "+str(lenGain)+"%")
+  #pageContent += "<article>"+article_only+"</article>"
+  pageContent += article_only
+  return pageContent
+
+
+  #pageContent += "\n"+article_only+"\n"
+  pageContent += "<article>\n"+article_only+"\n</article>\n"
+  lenAfter=len(article_only)
+  #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  #say("Gain        : "+str(lenGain)+"%")
+  return pageContent
+
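The extraction above slices the raw HTML between the first "<article" and the following "</article>" with str.index, then cleans the fragment with re.sub. A self-contained sketch of that slicing step (function name and sample HTML are illustrative, not part of the module):

import re

def slice_between(html, begin_marker, end_marker):
  # Locate the first begin marker, then the first end marker after it,
  # and return the enclosed fragment, end marker included.
  start = html.index(begin_marker)
  end = html.index(end_marker, start) + len(end_marker)
  return html[start:end]

sample = '<html><body><article id="a"><p>Bonjour</p></article></body></html>'
fragment = slice_between(sample, "<article", "</article>")
fragment = re.sub(r"><", '>\n<', fragment)  # same readability tweak as the parser
print(fragment)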
newsParser/newsParser/newsDefault.py  (+30 -11)
@@ -5,9 +5,9 @@ import re
 def articleImage(content):
   articleImgBegin ="<meta property=\"og:image\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     image = "favicon.png"
@@ -16,21 +16,31 @@ def articleImage(content):
 def articleDescription(content):
   articleImgBegin ="<meta property=\"og:description\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     title = "Description Extraction Failed"
   return title
 
+
+def checkBlock(content):
+  blockString="Attention Required! | Cloudflare"
+  try:
+    indexBlock = content.index(blockString)
+  except:
+    indexBlock = None
+  say("indexBlock: "+str(indexBlock))
+  return indexBlock
+
 def articleTitle(content):
   #articleImgBegin ="<meta property=\"og:title\" content=\""
   articleImgBegin ="\"og:title\" content=\""
   articleImgEnd   ="\""
-  indexImgBegin = content.index(articleImgBegin)
-  indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
   try:
+    indexImgBegin = content.index(articleImgBegin)
+    indexImgEnd   = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
     title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
   except:
     title = "Title Extraction Failed"
@@ -41,17 +51,26 @@ def article(url):
   r = requests.get(url, allow_redirects=True)
   content = r.text
 
-  articleStrImageUrl = articleImage(content)
-  articleStrTitle = articleTitle(content)
-  articleStrDescription = articleDescription(content)
+  with open("test.html","w") as f:
+    f.write(content)
+
+  if checkBlock(content) is not None:
+    articleStrImageUrl = None
+    articleStrTitle = "No Title"
+    articleStrDescription = "No Description"
+  else:
+    articleStrImageUrl = articleImage(content)
+    articleStrTitle = articleTitle(content)
+    articleStrDescription = articleDescription(content)
 
   pageContent = ""
   pageContent += "<meta property=\"og:type\" content=\"article\">\n"
   pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
   pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
   pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
+  if articleStrImageUrl is not None:
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
 
   articleCstBegin = "<article"
   articleCstEnd   = "</article>"
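A short sketch of the new Cloudflare detection flow, with the branch written out the way the "No Title"/"No Description" placeholders suggest it is meant to behave (function name and sample strings are illustrative):

def check_block(content):
  # Return the position of the Cloudflare challenge marker, or None when it is absent.
  try:
    return content.index("Attention Required! | Cloudflare")
  except ValueError:
    return None

blocked_page = "<title>Attention Required! | Cloudflare</title>"
normal_page = '<meta property="og:title" content="Some headline">'
for page in (blocked_page, normal_page):
  if check_block(page) is not None:
    print("blocked: fall back to placeholder title/description, no og:image")
  else:
    print("not blocked: extract og: metadata as usual")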
newsParser/newsParser/newsMediapart.py  (+15 -6)
@@ -14,9 +14,11 @@ def article(url):
   article_only=""
 
   htmlContent=newsParser.accountMediapart.getArticle(url)
-  #htmlContent=""
-  #with open("toto.html") as f:
+  #with open("toto.html","w") as f:
+  #  f.write(htmlContent)
+  #with open("toto.html","r") as f:
   #  htmlContent=f.read()
+  #print(len(htmlContent))
 
   articleStrTitle = ""
   articleStrDescription = ""
@@ -38,8 +40,6 @@ def article(url):
     articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
   except:
     pass
-  #with open("toto.html","w") as f:
-  #  f.write(htmlContent)
 
   pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
   pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
@@ -49,10 +49,19 @@ def article(url):
   pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
   pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
   articleElementBegin="<div class=\"news__heading__top"
-  #articleElementEnd  ="<div class=\"news__aside__feedback\">"
   articleElementEnd  ="<hr class=\"divider-horizontal"
+  articleElementEnd2  ="<div class=\"news__aside__feedback\">"
+  articleElementEnd3  ="</body>"
   indexElementBegin  = htmlContent.index(articleElementBegin)
-  indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)
+  try:
+    indexElementEnd    = htmlContent.index(articleElementEnd,indexElementBegin)
+  except:
+    try:
+      indexElementEnd    = htmlContent.index(articleElementEnd2,indexElementBegin)
+    except:
+      indexElementEnd    = htmlContent.index(articleElementEnd3,indexElementBegin)
+
+  #print(indexElementEnd)
   article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
   lenBefore=len(article_only)
   say("LengthBefore: "+str(lenBefore))
newsParser/newsParser/newsNYT.py  (new file, +60)
@@ -0,0 +1,60 @@
+# ~ import userio
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+
+
+def article(url):
+  say("Article: "+url)
+  session = HTMLSession()
+  response = session.get(url,timeout=20)
+  pageContent=""
+  article_only=""
+  with response as r:
+    articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
+    articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
+    articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
+    articleStrAuthor = r.html.xpath('//div[@class="author_wrapper"]/@content')
+    article=r.html.find("main")[0]
+    article_only+=article.html
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    #pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+
+  article_only = re.sub(r"<amp-img", '<img', article_only)
+  article_only = re.sub(r"</amp-img>", '', article_only)
+  article_only = re.sub(r"<h2", '<h3', article_only)
+  article_only = re.sub(r"</h2>", '</h3>', article_only)
+  article_only = re.sub(r"<h1", '<h2', article_only)
+  article_only = re.sub(r"</h1>", '</h2>', article_only)
+  article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+  article_only = re.sub(r'<script(.+?)/>','',article_only)
+  #article_only = re.sub(r'','',article_only)
+  article_only = re.sub(r"href=\"/",'href="https://www.nytimes.com/',article_only)  # base host assumed (nytimes.com)
+  article_only = re.sub(r"src=\"/",'src="https://www.nytimes.com/',article_only)
+  article_only = re.sub(r"^$",'',article_only)
+  article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+  article_only = re.sub(r"><",'>\n<',article_only)
+
+  #pageContent += "\n"+article_only+"\n"
+  pageContent += "<article>\n"+article_only+"\n</article>\n"
+  lenAfter=len(article_only)
+  lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+  say("LengthAfter : "+str(lenAfter))
+  say("Gain        : "+str(lenGain)+"%")
+  return pageContent
+
+if __name__ == "__main__":
+  #global server
+  url="https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html"
+  print(url)
+  article(url)
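newsNYT.py reads the og: metadata and the <main> element through requests_html. A condensed sketch of that lookup, assuming requests-html is installed and the page exposes the usual og: tags (missing tags fall back to empty strings):

from requests_html import HTMLSession

def og_meta(url):
  session = HTMLSession()
  r = session.get(url, timeout=20)
  # Each xpath query returns a list; a missing tag simply yields an empty list.
  title = r.html.xpath('//meta[@property="og:title"]/@content')
  image = r.html.xpath('//meta[@property="og:image"]/@content')
  main = r.html.find("main", first=True)
  return (title[0] if title else "",
          image[0] if image else "",
          main.html if main else "")

if __name__ == "__main__":
  # Same sample article as the module's __main__ block.
  print(og_meta("https://www.nytimes.com/2023/03/05/technology/artificial-intelligence-breast-cancer-detection.html")[0])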
newsParser/newsParser/newsNewYorkTimes.py  (deleted, -48)
@@ -1,48 +0,0 @@
-from userio import *
-import requests
-import re
-import newsParser
-
-def article(url):
-  say("Article: "+url)
-  r = requests.get(url, allow_redirects=True)
-  content = r.text
-
-  articleStrImageUrl = newsParser.articleImage(content)
-  articleStrTitle = newsParser.articleTitle(content)
-  articleStrDescription = newsParser.articleDescription(content)
-
-  pageContent = ""
-  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
-  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
-  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
-  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
-
-  articleCstBegin = "<section name=\"articleBody\""
-  articleCstEnd   = "</article>"
-  indexBegin = content.index(articleCstBegin)
-  indexEnd   = content.index(articleCstEnd)
-
-  article_only = ""
-  article_only += "<h2>"+articleStrTitle+"</h2>\n"
-  article_only += "<em>"+articleStrDescription+"</em>\n"
-  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
-  article_only += content[indexBegin:indexEnd]
-  article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
-  article_only = re.sub(r"<h2", '<h3', article_only)
-  article_only = re.sub(r"</h2>", '</h3>', article_only)
-  article_only = re.sub(r"<h1", '<h2', article_only)
-  article_only = re.sub(r"</h1>", '</h2>', article_only)
-  article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
-  # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-  # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
-  article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
-  article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
-  # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
-  article_only = article_only.replace("><", ">\n<")
-
-  article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
-  pageContent += "<article>"+article_only+"</article>"
-  return pageContent
newsParser/newsParser/newsStraitsTimes.py  (+20 -4)
@@ -8,10 +8,26 @@ def article(url):
   r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
   content = r.text
   pageContent = ""
-  articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
-  articleCstEnd   = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
-  indexBegin = content.index(articleCstBegin)
-  indexEnd   = content.index(articleCstEnd)
+  indexBegin=None
+  indexEnd=None
+
+  try:
+    articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
+    articleCstEnd   = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
+    indexBegin = content.index(articleCstBegin)
+    indexEnd   = content.index(articleCstEnd)
+  except:
+    pass
+
+  if indexBegin is None:
+    try:
+      articleCstBegin = '<div class="layout layout--onecol">'
+      articleCstEnd   = '<div class="field-name-body"'
+      indexBegin = content.index(articleCstBegin)
+      indexEnd   = content.index(articleCstEnd)
+    except:
+      pass
+
 
   articleStrImageUrl = newsParser.articleImage(content)
   articleStrTitle = newsParser.articleTitle(content)
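The change above tries one pair of layout markers and, if the first pair is absent, falls back to a second pair. The same idea written as a loop over candidate (begin, end) pairs (helper name is illustrative; the marker strings are the ones used in the diff):

def locate_body(content, marker_pairs):
  # Return (begin, end) indices for the first marker pair present in the page, else (None, None).
  for begin_marker, end_marker in marker_pairs:
    try:
      return content.index(begin_marker), content.index(end_marker)
    except ValueError:
      continue
  return None, None

pairs = [
  ('<div class="odd field-item" itemprop="articleBody"',
   '<div class="token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q" data-dsnote="mchammer">'),
  ('<div class="layout layout--onecol">', '<div class="field-name-body"'),
]
begin, end = locate_body('<div class="layout layout--onecol"><div class="field-name-body" ...>', pairs)
print(begin, end)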
newsParser/newsParser/newsWaPo.py  (+4 -1)
@@ -87,7 +87,10 @@ def article(url):
   article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
   article_only = re.sub(r"><",'>\n<',article_only)
 
-  #pageContent += "\n"+article_only+"\n"
+  if "<article" in article_only:
+    #pageContent += "\n"+article_only+"\n"
+    article_only = re.sub(r"<article (.+?)>", '', article_only)
+    article_only = re.sub(r"</article>", '', article_only)
   pageContent += "<article>\n"+article_only+"\n</article>\n"
   lenAfter=len(article_only)
   lenGain=0
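The newsWaPo change strips any <article> wrapper already present in the extracted fragment so that the re-wrapping below it does not produce nested <article> elements. A minimal sketch of that normalization (sample HTML is illustrative; the substitutions mirror the diff, so a bare <article> tag without attributes is left untouched):

import re

fragment = '<article class="wp"><p>Body text</p></article>'
if "<article" in fragment:
  # Drop the existing wrapper before re-wrapping the fragment.
  fragment = re.sub(r"<article (.+?)>", '', fragment)
  fragment = re.sub(r"</article>", '', fragment)
print("<article>\n" + fragment + "\n</article>")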