Showing 3 changed files with 148 additions and 0 deletions
+4
newsParser/__init__.py
... ...
@@ -23,6 +23,7 @@ from .newsParser import newsLeFigaro
23 23
 from .newsParser import newsLeMonde
24 24
 from .newsParser import newsLeParisien
25 25
 from .newsParser import newsLiberation
26
+from .newsParser import newsMediapart
26 27
 from .newsParser import newsMidiLibre
27 28
 from .newsParser import newsMothershipSG
28 29
 from .newsParser import newsNewYorker
... ...
@@ -44,6 +45,7 @@ from .newsParser import newsWaPo
44 45
 from .newsParser import newsYahooCom
45 46
 from .newsParser import newsZDNetFr
46 47
 # ~ from .newsParser import newsXXXXXX
48
+from .newsParser import accountMediapart
47 49
 
48 50
 def supportedList():
49 51
   current_module = __import__(__name__)
... ...
@@ -209,6 +211,8 @@ def getArticle(url):
209 211
       data_page += newsParser.newsSlateCom.article(url)
210 212
     elif "slate.fr" in url:
211 213
       data_page += newsParser.newsSlateFr.article(url)
214
+    elif "mediapart.fr" in url:
215
+      data_page += newsParser.newsMediapart.article(url)
212 216
     else:
213 217
        data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
214 218
        #data_page += "<p>Supported News:"
+47
newsParser/newsParser/accountMediapart.py.skel
... ...
@@ -0,0 +1,47 @@
1
+import urllib.parse
2
+import requests
3
+
4
+
5
# Account credentials: replace the placeholder values with a real Mediapart
# login (this file is shipped as a .skel for that reason).
name = "xxxxxxxxxxx"
password = "xxxxxxxxxxxx"
urlLogin = "https://www.mediapart.fr/login_check"


def getUsername():
  """Return the account name, percent-encoded for safe use in URLs/forms."""
  return urllib.parse.quote(name)


def getUserpassword():
  """Return the account password, percent-encoded for safe use in URLs/forms."""
  return urllib.parse.quote(password)


def getLoginUrl():
  """Return the Mediapart login-check endpoint URL."""
  return urlLogin
17
+
18
+
19
# Headers for plain page fetches: mimic a desktop Chrome browser so the site
# serves the normal article markup.
req_headers_main = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
  'Origin': 'https://www.mediapart.fr',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
# Headers for the login POST: same browser identity plus the form content
# type and a Referer matching the login page.
req_headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
  'Referer': 'https://www.mediapart.fr/login',
  'Origin': 'https://www.mediapart.fr',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
# Login form payload; field names presumably match the Mediapart
# login_check form — confirm against the live login page if logins break.
formdata = {
    'name': name,
    'password': password,
    '_target_path': 'https://www.mediapart.fr',
    'op' : 'Se connecter'
}
37
+
38
+
39
def getArticle(url):
  """Log in to Mediapart and return the HTML body of *url* as text."""
  sess = requests.session()
  # Prime the session on the login page, then submit the credentials form.
  login_page = sess.get('https://www.mediapart.fr/login', headers=req_headers_main, allow_redirects=True)
  auth = sess.post(urlLogin, data=formdata, headers=req_headers, cookies=login_page.cookies, allow_redirects=False)
  # Fetch the article itself with the authenticated cookies.
  page = sess.get(url, headers=req_headers_main, cookies=auth.cookies, allow_redirects=True)
  return page.text
46
+
47
+
+97
newsParser/newsParser/newsMediapart.py
... ...
@@ -0,0 +1,97 @@
1
+from userio import *
2
+import requests
3
+import re
4
+import newsParser
5
+from requests_html import HTML
6
+from requests_html import HTMLSession
7
+from bs4 import BeautifulSoup
8
+
9
+# Rename accountMediapart.py.skel to accountMediapart.py and fill in account details
10
+
11
def _meta_content(html, pattern):
  """Return the first capture group of *pattern* found in *html*, or ""."""
  match = re.search(pattern, html)
  return match.group(1) if match else ""


def article(url):
  """Fetch a Mediapart article (authenticated) and return cleaned HTML.

  Builds OpenGraph <meta> tags from the page's own metadata, then extracts
  the article body between known Mediapart layout markers and strips
  scripts, buttons, tooltips, icons and sharing widgets.

  url -- full article URL on mediapart.fr
  Returns the HTML fragment as a string; if the layout markers are missing
  the metadata plus a link to the original article is returned instead of
  raising (str.index in the previous version raised ValueError here).
  """
  say("Article: "+url)
  pageContent=""

  # Authenticated fetch; requires accountMediapart.py (rename the shipped
  # accountMediapart.py.skel and fill in account details).
  htmlContent=newsParser.accountMediapart.getArticle(url)

  # Pull page metadata; each value falls back to "" when the tag is absent.
  articleStrTitle       = _meta_content(htmlContent, r'<meta property="og:title" content="(.+?)" />')
  articleStrDescription = _meta_content(htmlContent, r'<meta property="og:description" content="(.+?)" />')
  articleStrImageUrl    = _meta_content(htmlContent, r'<meta property="og:image" content="(.+?)" />')
  articleStrAuthor      = _meta_content(htmlContent, r'<meta name="author" content="(.+?)" />')

  pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
  pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"

  # Locate the article body between known markers in Mediapart's layout.
  articleElementBegin="<div class=\"news__heading__top"
  articleElementEnd  ="<hr class=\"divider-horizontal"
  indexElementBegin = htmlContent.find(articleElementBegin)
  if indexElementBegin == -1:
    # Layout changed or the page is not an article: degrade gracefully.
    say("Warning: Mediapart extraction markers not found: "+url)
    pageContent += "<p>Extraction failed, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
    return pageContent
  indexElementEnd = htmlContent.find(articleElementEnd, indexElementBegin)
  if indexElementEnd == -1:
    indexElementEnd = len(htmlContent)  # end marker missing: keep the tail
  article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
  lenBefore=len(article_only)
  say("LengthBefore: "+str(lenBefore))

  # Normalise AMP images and demote headings one level so the article nests
  # under the feed's own heading structure.
  article_only = re.sub(r"<amp-img", '<img', article_only)
  article_only = re.sub(r"</amp-img>", '', article_only)
  article_only = re.sub(r"<h2", '<h3', article_only)
  article_only = re.sub(r"</h2>", '</h3>', article_only)
  article_only = re.sub(r"<h1", '<h2', article_only)
  article_only = re.sub(r"</h1>", '</h2>', article_only)
  # Strip scripts, buttons, tooltips, icons and sharing widgets.
  article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
  article_only = re.sub(r'<script(.+?)/>','',article_only)
  article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
  article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
  article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
  article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
  article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
  # Make site-relative links and images absolute.
  article_only = re.sub(r"href=\"/",'href="https://mediapart.fr/',article_only)
  article_only = re.sub(r"src=\"/",'src="https://mediapart.fr/',article_only)
  # Drop blank lines, then split adjacent tags for readability.
  article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
  article_only = re.sub(r"><",'>\n<',article_only)

  lenAfter=len(article_only)
  # Percentage of markup removed by the cleanup; logging only.
  lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
  say("LengthAfter : "+str(lenAfter))
  say("Gain        : "+str(lenGain)+"%")
  pageContent += article_only
  return pageContent