...
@@ -23,6 +23,7 @@ from .newsParser import newsLeFigaro
 from .newsParser import newsLeMonde
 from .newsParser import newsLeParisien
 from .newsParser import newsLiberation
+from .newsParser import newsMediapart
 from .newsParser import newsMidiLibre
 from .newsParser import newsMothershipSG
 from .newsParser import newsNewYorker
...
@@ -44,6 +45,7 @@ from .newsParser import newsWaPo
 from .newsParser import newsYahooCom
 from .newsParser import newsZDNetFr
 # ~ from .newsParser import newsXXXXXX
+from .newsParser import accountMediapart
 
 def supportedList():
     current_module = __import__(__name__)
...
@@ -209,6 +211,8 @@ def getArticle(url):
         data_page += newsParser.newsSlateCom.article(url)
     elif "slate.fr" in url:
         data_page += newsParser.newsSlateFr.article(url)
+    elif "mediapart.fr" in url:
+        data_page += newsParser.newsMediapart.article(url)
     else:
         data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
     #data_page += "<p>Supported News:"
...
@@ -0,0 +1,47 @@
+import urllib.parse
+import requests
+
+
+name="xxxxxxxxxxx"
+password="xxxxxxxxxxxx"
+urlLogin="https://www.mediapart.fr/login_check"
+
+def getUsername():
+    return urllib.parse.quote(name)
+
+def getUserpassword():
+    return urllib.parse.quote(password)
+
+def getLoginUrl():
+    return urlLogin
+
+
+req_headers_main = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
+    'Origin': 'https://www.mediapart.fr',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
+}
+req_headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 Safari/537.36',
+    'Referer': 'https://www.mediapart.fr/login',
+    'Origin': 'https://www.mediapart.fr',
+    'Content-Type': 'application/x-www-form-urlencoded',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
+}
+formdata = {
+    'name': name,
+    'password': password,
+    '_target_path': 'https://www.mediapart.fr',
+    'op' : 'Se connecter'
+}
+
+
+def getArticle(url):
+    # Authenticate
+    session = requests.session()
+    r = session.get('https://www.mediapart.fr/login', headers=req_headers_main, allow_redirects=True)
+    r = session.post(urlLogin, data=formdata, headers=req_headers, cookies=r.cookies, allow_redirects=False)
+    r2 = session.get(url, headers=req_headers_main, cookies=r.cookies, allow_redirects=True)
+    return r2.text
+
+
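The skeleton above performs a plain form login: it first GETs the /login page to obtain session cookies, POSTs the credentials to login_check, then fetches the requested URL with the same session. A minimal smoke test, once the placeholder name/password have been filled in, could look like the sketch below; the marker string checked at the end is an assumption about what a still-anonymous response would contain, not something the site guarantees.

if __name__ == "__main__":
    # Hypothetical one-off check of the authenticated fetch; any Mediapart URL
    # would do, the front page is used here only as a placeholder.
    html = getArticle("https://www.mediapart.fr")
    print("fetched", len(html), "bytes")
    # If the login failed, the response is typically the login form again,
    # whose submit action points back at /login_check.
    print("probably still anonymous:", "login_check" in html)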
...
@@ -0,0 +1,97 @@
+from userio import *
+import requests
+import re
+import newsParser
+from requests_html import HTML
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+
+# Rename accountMediapart.py.skel to accountMediapart.py and fill in account details
+
+def article(url):
+    say("Article: "+url)
+    pageContent=""
+    article_only=""
+
+    htmlContent=newsParser.accountMediapart.getArticle(url)
+    #htmlContent=""
+    #with open("toto.html") as f:
+    #    htmlContent=f.read()
+
+    articleStrTitle = ""
+    articleStrDescription = ""
+    articleStrImageUrl = ""
+    articleStrAuthor = ""
+    try:
+        articleStrTitle = re.search("<meta property=\"og:title\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrDescription = re.search("<meta property=\"og:description\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrImageUrl = re.search("<meta property=\"og:image\" content=\"(.+?)\" \/>",htmlContent).group(1)
+    except:
+        pass
+    try:
+        articleStrAuthor = re.search("<meta name=\"author\" content=\"(.+?)\" />",htmlContent).group(1)
+    except:
+        pass
+    #with open("toto.html","w") as f:
+    #    f.write(htmlContent)
+
+    pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
+    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
+    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
+    pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
+    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
+    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
+    pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
+    articleElementBegin="<div class=\"news__heading__top"
+    #articleElementEnd ="<div class=\"news__aside__feedback\">"
+    articleElementEnd ="<hr class=\"divider-horizontal"
+    indexElementBegin = htmlContent.index(articleElementBegin)
+    indexElementEnd = htmlContent.index(articleElementEnd,indexElementBegin)
+    article_only = "<div>"+htmlContent[indexElementBegin:indexElementEnd]
+    lenBefore=len(article_only)
+    say("LengthBefore: "+str(lenBefore))
+
+    article_only = re.sub(r"<amp-img", '<img', article_only)
+    article_only = re.sub(r"</amp-img>", '', article_only)
+    article_only = re.sub(r"<h2", '<h3', article_only)
+    article_only = re.sub(r"</h2>", '</h3>', article_only)
+    article_only = re.sub(r"<h1", '<h2', article_only)
+    article_only = re.sub(r"</h1>", '</h2>', article_only)
+    article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<script(.+?)/>','',article_only)
+    article_only = re.sub(r'<button(.+?)</button>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r' <div role="tooltip" id="disabled-pdf-notification">(.+?)</div>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<svg class="icon"(.+?)</svg>','',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span class="is-visually-hidden">Partager(.+?)</span>','',article_only,flags=re.M|re.S)
+    #article_only = re.sub(r'<a href=\"(.+?)data-smarttag-name="partage_(.+?)"(.+?)data-smarttag-type="action">(.+?)</a>','AAAAA ',article_only,flags=re.M|re.S)
+    article_only = re.sub(r'<span>Offrir<span class="is-hidden-until-md"> l’article</span>','',article_only)
+    #article_only = re.sub(r'','',article_only)
+    article_only = re.sub(r"href=\"/",'href="https://mediapart.fr/',article_only)
+    article_only = re.sub(r"src=\"/",'src="https://mediapart.fr/',article_only)
+    article_only = re.sub(r"^$",'',article_only)
+    article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
+    article_only = re.sub(r"><",'>\n<',article_only)
+
+    lenAfter=len(article_only)
+    lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    say("Gain : "+str(lenGain)+"%")
+    #pageContent += "<article>"+article_only+"</article>"
+    pageContent += article_only
+    return pageContent
+
+
+    #pageContent += "\n"+article_only+"\n"
+    pageContent += "<article>\n"+article_only+"\n</article>\n"
+    lenAfter=len(article_only)
+    #lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
+    say("LengthAfter : "+str(lenAfter))
+    #say("Gain : "+str(lenGain)+"%")
+    return pageContent
+
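Taken together, the three hunks route any "mediapart.fr" URL through the new parser: the getArticle() dispatch calls newsParser.newsMediapart.article(), which authenticates via accountMediapart and returns the og:* meta tags followed by the cleaned article body. A minimal call sketch, assuming the newsParser package is on the import path and accountMediapart.py has been created from the skeleton with real credentials (the article URL below is a placeholder, not a real link):

from newsParser import newsMediapart

# Runs the full path shown in the diff above: authenticated fetch,
# og:* meta extraction, then the regex clean-up pass over the article body.
page = newsMediapart.article("https://www.mediapart.fr/journal/france/some-article")
print(page[:300])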