Added Slate.com ・ 53a3983 ・ Gitprep

+140

newsParser/newsParser/newsSlateCom.py

...	...	@@ -0,0 +1,140 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+from requests_html import HTML
	5	+from requests_html import HTMLSession
	6	+
	7	+def articleImage(content):
	8	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	9	+ articleImgEnd ="\""
	10	+ indexImgBegin = content.index(articleImgBegin)
	11	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	12	+ try:
	13	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	14	+ except:
	15	+ image = "favicon.png"
	16	+ return image
	17	+
	18	+def articleDescription(content):
	19	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	20	+ articleImgEnd ="\""
	21	+ indexImgBegin = content.index(articleImgBegin)
	22	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	23	+ try:
	24	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	25	+ except:
	26	+ title = "Description Extraction Failed"
	27	+ return title
	28	+
	29	+def articleTitle(content):
	30	+ #articleImgBegin ="<meta property=\"og:title\" content=\""
	31	+ articleImgBegin ="\"og:title\" content=\""
	32	+ articleImgEnd ="\""
	33	+ indexImgBegin = content.index(articleImgBegin)
	34	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	35	+ try:
	36	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	37	+ except:
	38	+ title = "Title Extraction Failed"
	39	+ return title
	40	+
	41	+def article(url):
	42	+ say("Article: "+url)
	43	+ session = HTMLSession()
	44	+ response = session.get(url,timeout=20)
	45	+ pageContent=""
	46	+ article_only=""
	47	+ with response as r:
	48	+ #articleStrTitle=r.html.find('title')[0].text
	49	+ articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0]
	50	+ articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0]
	51	+ articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0]
	52	+ articleStrAuthor = r.html.xpath('//meta[@name="author"]/@content')[0]
	53	+ article=r.html.find("article")[0]
	54	+ pageContent += "<meta property=\"og:type\" content=\"article\" />\n"
	55	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\" />\n"
	56	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\" />\n"
	57	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\" />\n"
	58	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\" />\n"
	59	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\" />\n"
	60	+ pageContent += "<meta name=\"author\" content=\""+articleStrAuthor+"\" />\n"
	61	+ article_only+=article.html
	62	+
	63	+
	64	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	65	+ article_only = re.sub(r"</amp-img>", '', article_only)
	66	+ article_only = re.sub(r"<h2", '<h3', article_only)
	67	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	68	+ article_only = re.sub(r"<h1", '<h2', article_only)
	69	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	70	+ article_only = re.sub(r"<div class=\"slate-ad__label\">Advertisement</div>",'',article_only)
	71	+ article_only = re.sub(r"<!-- data-uri=(.+?)-->",'',article_only)
	72	+ article_only = re.sub(r"<script data-uri=\"(.+?)</script>",'',article_only,flags=re.M\|re.S)
	73	+ article_only = re.sub(r"<p class=\"slate-paragraph (.+?)\" data-word-count=\"(.+?)\" data-uri=\"(.+?)\">",'<p>',article_only)
	74	+ article_only = re.sub(r"<div class=\"slate-ad__creative\"(.+?)</div>",'',article_only,flags=re.M\|re.S)
	75	+ article_only = re.sub(r"<div class=\"slate-ad (.+?)</div>",'',article_only,flags=re.M\|re.S)
	76	+ article_only = re.sub(r"<div class=\"article__share-sidebar-xl share-sidebar\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	77	+ article_only = re.sub(r"<div class=\"article__share-sidebar share-sidebar\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	78	+ article_only = re.sub(r"<div class=\"article__left-rail\">(.+?)</div>",'',article_only,flags=re.M\|re.S)
	79	+ article_only = re.sub(r"<div class=\"article__podcast-subscribe\"/>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	80	+ article_only = re.sub(r"<div class=\"social-share\" aria-label=\"social media links\"(.+?)>(.+?)</div>",'',article_only,flags=re.M\|re.S)
	81	+ article_only = re.sub(r"<img (.+?)src=\"(.+?)\"(.+?)>",r'<img src="\2">',article_only)
	82	+ article_only = re.sub(r"data-uri=\"slate.com",'data-uri="https://slate.com',article_only)
	83	+ article_only = re.sub(r"<aside(.+)</aside>",'',article_only,flags=re.M\|re.S)
	84	+ article_only = re.sub(r"<aside(.+)/>",'',article_only,flags=re.M\|re.S)
	85	+ article_only = re.sub(r"<noscript(.+)</noscript>",'',article_only,flags=re.M\|re.S)
	86	+ #article_only = re.sub(r"",'',article_only)
	87	+ article_only = re.sub(r"href=\"/",'href="https://slate.com/,',article_only)
	88	+ article_only = re.sub(r"^$",'',article_only)
	89	+ article_only = re.sub(r'^\s*$', '',article_only,flags=re.M\|re.S)
	90	+
	91	+ pageContent += "<article>\n"+article_only+"\n</article>\n"
	92	+ say("Length: "+str(len(article_only)))
	93	+ return pageContent
	94	+
	95	+def articleOld(url):
	96	+ say("Article: "+url)
	97	+ r = requests.get(url, allow_redirects=True)
	98	+ content = r.text
	99	+ r.html.find('article')
	100	+
	101	+ articleStrImageUrl = articleImage(content)
	102	+ articleStrTitle = articleTitle(content)
	103	+ articleStrDescription = articleDescription(content)
	104	+
	105	+ pageContent = ""
	106	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	107	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	108	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	109	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	110	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	111	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">\n"
	112	+
	113	+ articleCstBegin = "<article"
	114	+ articleCstEnd = "</article>"
	115	+ articleCstBegin2 = "<body"
	116	+ articleCstEnd2 = "</body>"
	117	+ try:
	118	+ indexBegin = content.index(articleCstBegin)
	119	+ except:
	120	+ try:
	121	+ indexBegin = content.index(articleCstBegin2)
	122	+ except:
	123	+ indexBegin = 0
	124	+ try:
	125	+ indexEnd = content.index(articleCstEnd)
	126	+ except:
	127	+ try:
	128	+ indexEnd = content.index(articleCstEnd2)
	129	+ except:
	130	+ indexEnd = strlen(content)
	131	+ article_only = content[indexBegin:indexEnd]
	132	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	133	+ article_only = re.sub(r"</amp-img>", '', article_only)
	134	+ article_only = re.sub(r"<h2", '<h3', article_only)
	135	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	136	+ article_only = re.sub(r"<h1", '<h2', article_only)
	137	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	138	+ article_only = article_only.replace("><", ">\n<")
	139	+ pageContent += "<article>"+article_only+"</article>"
	140	+ return pageContent