Added slate.fr ・ 3af915c ・ Gitprep

+114

newsParser/newsParser/newsSlateFr.py

...	...	@@ -0,0 +1,114 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+from requests_html import HTML
	5	+from requests_html import HTMLSession
	6	+
	7	+def articleImage(content):
	8	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	9	+ articleImgEnd ="\""
	10	+ indexImgBegin = content.index(articleImgBegin)
	11	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	12	+ try:
	13	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	14	+ except:
	15	+ image = "favicon.png"
	16	+ return image
	17	+
	18	+def articleDescription(content):
	19	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	20	+ articleImgEnd ="\""
	21	+ indexImgBegin = content.index(articleImgBegin)
	22	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	23	+ try:
	24	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	25	+ except:
	26	+ title = "Description Extraction Failed"
	27	+ return title
	28	+
	29	+def articleTitle(content):
	30	+ #articleImgBegin ="<meta property=\"og:title\" content=\""
	31	+ articleImgBegin ="\"og:title\" content=\""
	32	+ articleImgEnd ="\""
	33	+ indexImgBegin = content.index(articleImgBegin)
	34	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	35	+ try:
	36	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	37	+ except:
	38	+ title = "Title Extraction Failed"
	39	+ return title
	40	+
	41	+def article(url):
	42	+ say("Article: "+url)
	43	+ session = HTMLSession()
	44	+ response = session.get(url,timeout=20)
	45	+ pageContent=""
	46	+ article_only=""
	47	+ with response as r:
	48	+ #articleStrTitle=r.html.find('title')[0].text
	49	+ articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
	50	+ articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
	51	+ articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
	52	+ articleStrAuthor = r.html.xpath('//meta[@name="author"]/@content')[0]
	53	+ article=r.html.find("article")[0]
	54	+ pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
	55	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
	56	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
	57	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
	58	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
	59	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
	60	+ pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
	61	+ article_only+=article.html
	62	+
	63	+
	64	+ lenBefore=len(article_only)
	65	+ say("LengthBefore: "+str(lenBefore))
	66	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	67	+ article_only = re.sub(r"</amp-img>", '', article_only)
	68	+ article_only = re.sub(r"<h2", '<h3', article_only)
	69	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	70	+ article_only = re.sub(r"<h1", '<h2', article_only)
	71	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	72	+ #article_only = re.sub(r"<div class=\"slate-ad__label\">Advertisement</div>",'',article_only)
	73	+ #article_only = re.sub(r"<!-- data-uri=(.+?)-->",'',article_only)
	74	+ #article_only = re.sub(r"<script data-uri=\"(.+?)</script>",'',article_only,flags=re.M\|re.S)
	75	+ #article_only = re.sub(r"<p class=\"slate-paragraph (.+?)\" data-word-count=\"(.+?)\" data-uri=\"(.+?)\">",'<p>',article_only)
	76	+ #article_only = re.sub(r"<div class=\"slate-ad__creative\"(.+?)</div>",'',article_only,flags=re.M\|re.S)
	77	+ #article_only = re.sub(r"<div class=\"slate-ad (.+?)</div>",'',article_only,flags=re.M\|re.S)
	78	+ #article_only = re.sub(r"<div class=\"article__share-sidebar-xl share-sidebar\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	79	+ #article_only = re.sub(r"<div class=\"article__share-sidebar share-sidebar\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	80	+ #article_only = re.sub(r"<div class=\"article__left-rail\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	81	+ #article_only = re.sub(r"<div class=\"article__podcast-subscribe\"/>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	82	+ #article_only = re.sub(r"<div class=\"social-share\" aria-label=\"social media links\"(.+?)>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	83	+ #article_only = re.sub(r"<img (.+?)src=\"(.+?)\"(.+?)>",r'<img src="\2">',article_only)
	84	+ #article_only = re.sub(r"data-uri=\"slate.com",'data-uri="https://slate.com',article_only)
	85	+ #article_only = re.sub(r"<aside(.+)</aside>",'',article_only,flags=re.M\|re.S)
	86	+ #article_only = re.sub(r"<aside(.+)/>",'',article_only,flags=re.M\|re.S)
	87	+ article_only = re.sub(r"<script(.+)</script>",'',article_only,flags=re.M\|re.S)
	88	+ article_only = re.sub(r"<nav class=\"footer__socials\">(.+)</nav>",'',article_only,flags=re.M\|re.S)
	89	+
	90	+ article_only = re.sub(r"<form method=\"post\" target=\"_blank\">(.+?)</form>",'',article_only,flags=re.M\|re.S)
	91	+ article_only = re.sub(r"<div class=\"article-header(.+?)\">",'<div>',article_only)
	92	+ article_only = re.sub(r"<button class=\"sharing-btn(.+?)</button>",'',article_only)
	93	+ article_only = re.sub(r"<div class=\"article-content(.+?)\">",'<div>',article_only)
	94	+ article_only = re.sub(r" class=\"article-header__title\"",'',article_only)
	95	+ article_only = re.sub(r" class=\"article-header__chapo\"",'',article_only)
	96	+ article_only = re.sub(r"<div class=\"row\">",'<div>',article_only)
	97	+ article_only = re.sub(r"<div (.+?)/>",'',article_only)
	98	+ #article_only = re.sub(r"",'',article_only)
	99	+ article_only = re.sub(r"href=\"/",'href="https://slate.fr/',article_only)
	100	+ article_only = re.sub(r"src=\"/",'src="https://slate.fr/',article_only)
	101	+ article_only = re.sub(r"<img class=\"image image--full lazyload\" data-full-src=\"",'<img src="',article_only)
	102	+ article_only = re.sub(r"<div class=\"sharing-tools(.+?)>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	103	+ article_only = re.sub(r"<div class=\"ad(.+?)>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	104	+ article_only = re.sub(r"^$",'',article_only)
	105	+ article_only = re.sub(r'^\s*$', '',article_only,flags=re.M\|re.S)
	106	+
	107	+ #pageContent += "<article>\n"+article_only+"\n</article>\n"
	108	+ pageContent += article_only+"\n"
	109	+ lenAfter=len(article_only)
	110	+ lenGain=float(10000-int(float(100lenAfter/lenBefore)100))/100
	111	+ say("LengthAfter : "+str(lenAfter))
	112	+ say("Gain : "+str(lenGain)+"%")
	113	+ return pageContent
	114	+

●	newsParser/__init__.py	+3
+	newsParser/newsParser/newsSlateFr.py	+114

...	...	@@ -46,6 +46,7 @@ from .newsParser import newsBFM
46	46	from .newsParser import newsDefault
47	47	from .newsParser import newsLNC
48	48	from .newsParser import newsSlateCom
	49	+from .newsParser import newsSlateFr
49	50	# ~ from .newsParser import newsTodayOnlineSG
50	51
51	52	def supportedList():
...	...	@@ -212,6 +213,8 @@ def getArticle(url):
212	213	data_page += newsParser.newsLNC.article(url)
213	214	elif "slate.com" in url:
214	215	data_page += newsParser.newsSlateCom.article(url)
	216	+ elif "slate.fr" in url:
	217	+ data_page += newsParser.newsSlateFr.article(url)
215	218	else:
216	219	data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
217	220	#data_page += "<p>Supported News:"