| ... | ... |
@@ -23,6 +23,7 @@ from .newsParser import newsLeFigaro |
| 23 | 23 |
from .newsParser import newsLeMonde |
| 24 | 24 |
from .newsParser import newsLeParisien |
| 25 | 25 |
from .newsParser import newsLiberation |
| 26 |
+from .newsParser import newsMediapart |
|
| 26 | 27 |
from .newsParser import newsMidiLibre |
| 27 | 28 |
from .newsParser import newsMothershipSG |
| 28 | 29 |
from .newsParser import newsNewYorker |
| ... | ... |
@@ -44,6 +45,7 @@ from .newsParser import newsWaPo |
| 44 | 45 |
from .newsParser import newsYahooCom |
| 45 | 46 |
from .newsParser import newsZDNetFr |
| 46 | 47 |
# ~ from .newsParser import newsXXXXXX |
| 48 |
+from .newsParser import accountMediapart |
|
| 47 | 49 |
|
| 48 | 50 |
def supportedList(): |
| 49 | 51 |
current_module = __import__(__name__) |
| ... | ... |
@@ -209,6 +211,8 @@ def getArticle(url): |
| 209 | 211 |
data_page += newsParser.newsSlateCom.article(url) |
| 210 | 212 |
elif "slate.fr" in url: |
| 211 | 213 |
data_page += newsParser.newsSlateFr.article(url) |
| 214 |
+ elif "mediapart.fr" in url: |
|
| 215 |
+ data_page += newsParser.newsMediapart.article(url) |
|
| 212 | 216 |
else: |
| 213 | 217 |
data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n" |
| 214 | 218 |
#data_page += "<p>Supported News:" |
| ... | ... |
@@ -0,0 +1,47 @@ |
| 1 |
+import urllib.parse |
|
| 2 |
+import requests |
|
| 3 |
+ |
|
| 4 |
+ |
|
| 5 |
# Placeholder account credentials; replace them with a real Mediapart login.
name = "xxxxxxxxxxx"
password = "xxxxxxxxxxxx"

# URL the login form is posted to.
urlLogin = "https://www.mediapart.fr/login_check"
|
| 8 |
+ |
|
| 9 |
def getUsername():
    """Return the module-level account name, percent-encoded by urllib.parse.quote."""
    quoted_name = urllib.parse.quote(name)
    return quoted_name
|
| 11 |
+ |
|
| 12 |
def getUserpassword():
    """Return the module-level account password, percent-encoded by urllib.parse.quote."""
    quoted_password = urllib.parse.quote(password)
    return quoted_password
|
| 14 |
+ |
|
| 15 |
def getLoginUrl():
    """Return the module-level login URL unchanged."""
    login_url = urlLogin
    return login_url
|
| 17 |
+ |
|
| 18 |
+ |
|
| 19 |
# Headers sent with plain page fetches (the login page and the article itself).
req_headers_main = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
    'Origin': 'https://www.mediapart.fr',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

# Headers sent with the form-encoded login POST.  The values shared with
# req_headers_main are taken from it so the two sets cannot drift apart.
req_headers = {
    'User-Agent': req_headers_main['User-Agent'],
    'Referer': 'https://www.mediapart.fr/login',
    'Origin': req_headers_main['Origin'],
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': req_headers_main['Accept'],
}
|
| 31 |
# Fields of the login form.  The credentials are read from the module-level
# name/password once, when this module is imported.
formdata = {
    "name": name,
    "password": password,
    # NOTE(review): presumably the page the site redirects to after a
    # successful login -- confirm against the login form.
    "_target_path": "https://www.mediapart.fr",
    "op": "Se connecter",
}
|
| 37 |
+ |
|
| 38 |
+ |
|
| 39 |
def getArticle(url, timeout=30):
    """Log in to Mediapart and return the HTML text of the page at *url*.

    A fresh session is opened for every call: the login page is fetched, the
    login form is posted to ``urlLogin`` and the requested page is then
    downloaded with the same session.  The login result is not checked, so
    when authentication fails the text returned is whatever the site serves
    to that session.

    Args:
        url: address of the page to download.
        timeout: seconds to wait on each HTTP request before
            ``requests`` raises a timeout error (previously the requests
            could block forever).

    Returns:
        The body of the final response, decoded as text.
    """
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Fetch the login page first so the session holds any cookies it sets
        # before the form is posted.  The session keeps cookies between
        # requests, so they do not need to be passed along explicitly.
        session.get(
            'https://www.mediapart.fr/login',
            headers=req_headers_main,
            allow_redirects=True,
            timeout=timeout,
        )
        # The redirect answering the login POST is deliberately not followed.
        session.post(
            urlLogin,
            data=formdata,
            headers=req_headers,
            allow_redirects=False,
            timeout=timeout,
        )
        response = session.get(
            url,
            headers=req_headers_main,
            allow_redirects=True,
            timeout=timeout,
        )
        return response.text
|
| 46 |
+ |
|
| 47 |
+ |
| ... | ... |
@@ -0,0 +1,97 @@ |
| 1 |
+from userio import * |
|
| 2 |
+import requests |
|
| 3 |
+import re |
|
| 4 |
+import newsParser |
|
| 5 |
+from requests_html import HTML |
|
| 6 |
+from requests_html import HTMLSession |
|
| 7 |
+from bs4 import BeautifulSoup |
|
| 8 |
+ |
|
| 9 |
+# Rename accountMediapart.py.skel to accountMediapart.py and fill in account details |
|
| 10 |
+ |
|
| 11 |
def _metaContent(htmlContent, pattern):
    """Return group 1 of the first match of *pattern*, or "" when there is none."""
    found = re.search(pattern, htmlContent)
    return found.group(1) if found else ""


def _cleanArticleHtml(article_only):
    """Apply the ordered clean-up substitutions to the extracted article HTML."""
    multiline = re.M | re.S
    # Order matters: <h2> must become <h3> before <h1> becomes <h2>.
    substitutions = (
        (r"<amp-img", '<img', 0),
        (r"</amp-img>", '', 0),
        (r"<h2", '<h3', 0),
        (r"</h2>", '</h3>', 0),
        (r"<h1", '<h2', 0),
        (r"</h1>", '</h2>', 0),
        (r'<script(.+?)</script>', '', multiline),
        (r'<script(.+?)/>', '', 0),
        (r'<button(.+?)</button>', '', multiline),
        (r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>', '', multiline),
        (r'<svg class="icon"(.+?)</svg>', '', multiline),
        (r'<span class="is-visually-hidden">Partager(.+?)</span>', '', multiline),
        (r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>', '', 0),
        # Make site-relative links absolute.  The lookahead leaves
        # protocol-relative URLs ("//host/...") untouched; prefixing those
        # would produce broken addresses.
        (r'href="/(?!/)', 'href="https://mediapart.fr/', 0),
        (r'src="/(?!/)', 'src="https://mediapart.fr/', 0),
        # Empty out whitespace-only lines.
        (r'^\s*$', '', multiline),
        # Put adjacent tags on separate lines.
        (r"><", '>\n<', 0),
    )
    for pattern, replacement, flags in substitutions:
        article_only = re.sub(pattern, replacement, article_only, flags=flags)
    return article_only


def article(url):
    """Download a Mediapart article and return a reduced HTML rendering of it.

    The page is fetched through ``newsParser.accountMediapart.getArticle``.
    The result starts with Open Graph / author ``<meta>`` tags copied from the
    page (empty when a tag is absent), followed by the article markup found
    between the heading marker and the first horizontal divider after it,
    with scripts, buttons and sharing widgets stripped.

    Args:
        url: address of the article.

    Returns:
        The ``<meta>`` tags and the cleaned article HTML as one string.

    Raises:
        ValueError: if the article start or end marker is not in the page.
    """
    say("Article: "+url)

    htmlContent = newsParser.accountMediapart.getArticle(url)

    articleStrTitle = _metaContent(htmlContent, r'<meta property="og:title" content="(.+?)" />')
    articleStrDescription = _metaContent(htmlContent, r'<meta property="og:description" content="(.+?)" />')
    articleStrImageUrl = _metaContent(htmlContent, r'<meta property="og:image" content="(.+?)" />')
    articleStrAuthor = _metaContent(htmlContent, r'<meta name="author" content="(.+?)" />')

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"

    articleElementBegin = "<div class=\"news__heading__top"
    articleElementEnd = "<hr class=\"divider-horizontal"
    indexElementBegin = htmlContent.find(articleElementBegin)
    if indexElementBegin < 0:
        # Same exception type str.index() raised, with a usable message.
        raise ValueError("Mediapart article start marker not found in "+url)
    indexElementEnd = htmlContent.find(articleElementEnd, indexElementBegin)
    if indexElementEnd < 0:
        raise ValueError("Mediapart article end marker not found in "+url)

    article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
    # Never zero: the extract always starts with "<div>".
    lenBefore = len(article_only)
    say("LengthBefore: "+str(lenBefore))

    article_only = _cleanArticleHtml(article_only)

    lenAfter = len(article_only)
    # Size reduction in percent, truncated to two decimals.
    lenGain = (10000 - int(100 * lenAfter / lenBefore * 100)) / 100
    say("LengthAfter : "+str(lenAfter))
    say("Gain : "+str(lenGain)+"%")

    pageContent += article_only
    return pageContent
|
| 97 |
+ |