...
@@ -23,6 +23,7 @@ from .newsParser import newsLeFigaro
 from .newsParser import newsLeMonde
 from .newsParser import newsLeParisien
 from .newsParser import newsLiberation
+from .newsParser import newsMediapart
 from .newsParser import newsMidiLibre
 from .newsParser import newsMothershipSG
 from .newsParser import newsNewYorker
...
@@ -44,6 +45,7 @@ from .newsParser import newsWaPo
 from .newsParser import newsYahooCom
 from .newsParser import newsZDNetFr
 # ~ from .newsParser import newsXXXXXX
+from .newsParser import accountMediapart
 
 def supportedList():
     current_module = __import__(__name__)
...
@@ -209,6 +211,8 @@ def getArticle(url):
         data_page += newsParser.newsSlateCom.article(url)
     elif "slate.fr" in url:
         data_page += newsParser.newsSlateFr.article(url)
+    elif "mediapart.fr" in url:
+        data_page += newsParser.newsMediapart.article(url)
     else:
         data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
     #data_page += "<p>Supported News:"
...
@@ -0,0 +1,47 @@
+import urllib.parse
+import requests
+
+
+name="xxxxxxxxxxx"
+password="xxxxxxxxxxxx"
+urlLogin="https://www.mediapart.fr/login_check"
+
+def getUsername():
+    return urllib.parse.quote(name)
+
+def getUserpassword():
+    return urllib.parse.quote(password)
+
+def getLoginUrl():
+    return urlLogin
+
+
+req_headers_main = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
+    'Origin': 'https://www.mediapart.fr',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
+}
+req_headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
+    'Referer': 'https://www.mediapart.fr/login',
+    'Origin': 'https://www.mediapart.fr',
+    'Content-Type': 'application/x-www-form-urlencoded',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
+}
+formdata = {
+    'name': name,
+    'password': password,
+    '_target_path': 'https://www.mediapart.fr',
+    'op' : 'Se connecter'
+}
+
+
+def getArticle(url):
+    # Authenticate
+    session = requests.session()
+    r = session.get('https://www.mediapart.fr/login', headers=req_headers_main, allow_redirects=True)
+    r = session.post(urlLogin, data=formdata, headers=req_headers, cookies=r.cookies, allow_redirects=False)
+    r2 = session.get(url, headers=req_headers_main, cookies=r.cookies, allow_redirects=True)
+    return r2.text
+
+
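The skeleton above performs a plain form login: it first GETs the /login page to obtain session cookies, POSTs the credentials to login_check, then fetches the requested URL with the same session. A minimal smoke test, once the placeholder name/password have been filled in, could look like the sketch below; the marker string checked at the end is an assumption about what a still-anonymous response would contain, not something the site guarantees.

if __name__ == "__main__":
    # Hypothetical one-off check of the authenticated fetch; any Mediapart URL
    # would do, the front page is used here only as a placeholder.
    html = getArticle("https://www.mediapart.fr")
    print("fetched", len(html), "bytes")
    # If the login failed, the response is typically the login form again,
    # whose submit action points back at /login_check.
    print("probably still anonymous:", "login_check" in html)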
...
@@ -0,0 +1,97 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+# Rename accountMediapart.py.skel to accountMediapart.py and fill in account details
+
+def article(url):
+    say("Article: "+url)
+    pageContent=""
+    article_only=""
+
+    htmlContent=newsParser.accountMediapart.getArticle(url)
+    #htmlContent=""
+    #with open("toto.html") as f:
+    #    htmlContent=f.read()
+
+    articleStrTitle = ""
+    articleStrDescription = ""
+    articleStrImageUrl = ""
+    articleStrAuthor = ""
+    try:
+        articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+    except:
+        pass
+    #with open("toto.html","w") as f:
+    #    f.write(htmlContent)
+
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+    articleElementBegin="<div class=\"news__heading__top"
+    #articleElementEnd ="<div class=\"news__aside__feedback\">"
+    articleElementEnd ="<hr class=\"divider-horizontal"
+    indexElementBegin = htmlContent.index(articleElementBegin)
+    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+    #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://mediapart.fr/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://mediapart.fr/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    say("Gain : "+str(lenGain)+"%")
+    #pageContent += "<article>"+article_only+"</article>"
+    pageContent += article_only
+    return pageContent
+
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    #say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
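Taken together, the three hunks route any "mediapart.fr" URL through the new parser: the getArticle() dispatch calls newsParser.newsMediapart.article(), which authenticates via accountMediapart and returns the og:* meta tags followed by the cleaned article body. A minimal call sketch, assuming the newsParser package is on the import path and accountMediapart.py has been created from the skeleton with real credentials (the article URL below is a placeholder, not a real link):

from newsParser import newsMediapart

# Runs the full path shown in the diff above: authenticated fetch,
# og:* meta extraction, then the regex clean-up pass over the article body.
page = newsMediapart.article("https://www.mediapart.fr/journal/france/some-article")
print(page[:300])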