from userio import * import requests import re import newsParser from requests_html import HTML from requests_html import HTMLSession def article(url): say("Article: "+url) if not "/amphtml" in url: say("Trying AMP") url = url.replace("buzzfeednews.com/article","buzzfeednews.com/amphtml") url = url.replace("buzzfeed.com/","buzzfeed.com/amphtml/") url.replace("?origin=web-hf","") session = HTMLSession() response = session.get(url,timeout=20,headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}) pageContent="" article_only="" with response as r: #articleStrTitle = r.html.xpath('//meta[@property="og:title"]/@content')[0] #articleStrDescription = r.html.xpath('//meta[@property="og:description"]/@content')[0] #articleStrImageUrl = r.html.xpath('//meta[@property="og:image"]/@content')[0] #articleStrAuthor = r.html.xpath('//div[@class="author_wrapper"]/@content') #print(articleStrAuthor) article=r.html.find("main")[0] #article=r.html.find("body")[0] article_only+=article.html lenBefore=len(article_only) say("LengthBefore: "+str(lenBefore)) pageContent += "\n" #pageContent += "\n" #pageContent += "\n" pageContent += "\n" #pageContent += "\n" pageContent += "\n" #pageContent += "\n" article_only = re.sub(r"", '', article_only) article_only = re.sub(r"", '', article_only) article_only = re.sub(r"", '', article_only) article_only = re.sub(r'