"""Article scraper: fetch a news page and return only its <main> content.

NOTE(review): this file was recovered from a corrupted copy in which the
HTML tag literals and most regex patterns had been stripped out (several
``re.sub(r"", ...)`` no-ops and a string literal broken across lines).
Every spot where the original could not be reconstructed is marked with a
TODO(review) so the lost text can be restored from version control.
"""
from userio import *

import re

import requests
from requests_html import HTML
from requests_html import HTMLSession

import newsParser


def article(url):
    """Fetch *url* and return a cleaned HTML fragment of its <main> element.

    Site-relative ``href``/``src`` attributes are rewritten to absolute
    URLs, whitespace-only lines are blanked, and adjacent tags are split
    onto separate lines.  Progress and size statistics are reported
    through ``say`` (from ``userio``).

    Parameters:
        url (str): absolute URL of the article page to fetch.

    Returns:
        str: printable fragment built around the page's <main> element.

    Raises:
        IndexError: if the page lacks the og: meta tags or a <main>
            element (same assumption the original code made).
    """
    say("Article: " + url)

    session = HTMLSession()
    response = session.get(url, timeout=20)

    page_content = ""
    article_only = ""
    with response as r:
        # Open Graph metadata.  Currently unused below because the markup
        # that consumed it was lost in the corrupted copy — kept so the
        # lost lines can be reconstructed.  TODO(review): restore usage.
        article_title = r.html.xpath('//meta[@property="og:title"]/@content')[0]
        article_description = r.html.xpath('//meta[@property="og:description"]/@content')[0]
        article_image_url = r.html.xpath('//meta[@property="og:image"]/@content')[0]
        article_author = r.html.xpath('//div[@class="author_wrapper"]/@content')

        # Keep only the page's <main> element; everything else is dropped.
        main_element = r.html.find("main")[0]
        article_only += main_element.html

    len_before = len(article_only)
    say("LengthBefore: " + str(len_before))

    # The original appended six markup lines here (each ended with "\n");
    # the tag text between them was stripped from this copy.
    # TODO(review): restore the lost header markup built from the
    # metadata extracted above.
    page_content += "\n" * 6

    # TODO(review): several tag-stripping re.sub() patterns were lost in
    # the corrupted copy; the surviving calls with empty patterns were
    # no-ops and have been removed.  Restore the real patterns.

    # Make site-relative links and resources absolute.
    # TODO(review): "xxxxx" is a redacted host name — restore the real one.
    article_only = re.sub(r"href=\"/", 'href="https://xxxxx/', article_only)
    article_only = re.sub(r"src=\"/", 'src="https://xxxxx/', article_only)

    # Blank out whitespace-only lines, then put adjacent tags on their
    # own lines for readability.
    article_only = re.sub(r'^\s*$', '', article_only, flags=re.M | re.S)
    article_only = re.sub(r"><", '>\n<', article_only)

    # TODO(review): the wrapper element around the article body was lost;
    # only the surrounding newlines survive.
    page_content += "\n" + article_only + "\n\n"

    len_after = len(article_only)
    # Percentage of the page removed by the cleanup, truncated to two
    # decimals (the original formula, preserved).  Guard against an empty
    # <main> — the original would raise ZeroDivisionError there.
    if len_before:
        len_gain = float(10000 - int(float(100 * len_after / len_before) * 100)) / 100
    else:
        len_gain = 0.0
    say("LengthAfter : " + str(len_after))
    say("Gain : " + str(len_gain) + "%")
    return page_content