from userio import *
import requests
import re
import newsParser

# NOTE(review): in the reviewed copy of this file every HTML marker, every HTML
# template fragment and several regex patterns are empty strings or bare
# newlines, and two end markers (``articleElementEnd``, ``articleCstEnd``) are
# read without ever being assigned.  This looks like markup that was stripped
# out of the string literals -- TODO restore the real values from the upstream
# file.  The constants below hold exactly what the reviewed copy shows;
# ``_index_of`` rejects empty markers so the missing values fail loudly instead
# of silently yielding an empty page.

# Delimiters around the title / description / image values in the page source.
_TITLE_BEGIN = ""
_TITLE_END = ""  # never assigned in the reviewed copy
_DESCRIPTION_BEGIN = ""
_DESCRIPTION_END = ""  # never assigned in the reviewed copy
_IMAGE_BEGIN = ""
_IMAGE_END = ""  # never assigned in the reviewed copy

# Start of the article body, and the end markers tried in order of preference.
_ARTICLE_BEGIN = ""
_ARTICLE_END_CANDIDATES = (
    "",  # articleCstEnd: never assigned in the reviewed copy
    "",  # articleCstEnd2
    "\n",  # articleCstEnd3
)

# Fragments of the generated page, as shown in the reviewed copy.
_PAGE_HEAD = "\n\n\n\n\n"
_PAGE_BODY_OPEN = "\n"
_PAGE_BODY_CLOSE = "\n"
_TITLE_OPEN = "\n\n"
_TITLE_CLOSE = "\n\n"

# Rewrites image-proxy URLs to the value of their ``src`` query parameter.
_IMAGE_PROXY_PATTERN = r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+"

# (pattern, replacement) pairs applied in order to the extracted article.
# NOTE(review): the reviewed copy contained eight further substitutions whose
# pattern was empty (replaced by '') or a bare newline (replaced by a newline).
# As shown they change nothing, so they are not listed here; restore them
# together with the other stripped literals.
_CLEANUP_SUBSTITUTIONS = (
    (r"\nAdvertisement\n", ''),
    (r"\nStory continues below advertisement\n", ""),
    (r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"'),
    (r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"'),
    (r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"'),
    (r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"'),
    (r"style=\"min-height:250px\"", 'style=\"min-height:1px\"'),
    (r"style=\"min-height:298px\"", 'style=\"min-height:1px\"'),
    (r"style=\"min-height:600px\"", 'style=\"min-height:1px\"'),
    (r"class=\"center absolute w-100\" style=\"top:-12px\"", ''),
    (r"filter:blur\(10px\);", ''),
)

# Seconds to wait for the server before giving up on the download.
_REQUEST_TIMEOUT_SECONDS = 30


def _index_of(content, marker, start=0):
    """Return the index of *marker* in *content* at or after *start*.

    Raises:
        ValueError: if *marker* is empty (not configured) or not found.
    """
    if not marker:
        raise ValueError("HTML marker is empty; restore it from the upstream source")
    return content.index(marker, start)


def _first_index_of(content, markers, start=0):
    """Return the index of the first of *markers* found at or after *start*.

    Raises:
        ValueError: if none of *markers* is usable or present.
    """
    for marker in markers:
        try:
            return _index_of(content, marker, start)
        except ValueError:
            continue
    raise ValueError("none of the article end markers was found")


def _text_between(content, begin, end):
    """Return the text enclosed by the first *begin* marker and the next *end*.

    The end marker is searched for only after the begin marker, so an end
    marker that also occurs inside the begin marker cannot truncate the value.

    Raises:
        ValueError: if either marker is empty or not found.
    """
    value_start = _index_of(content, begin) + len(begin)
    return content[value_start:_index_of(content, end, value_start)]


def localArticleTitle(content):
    """Return the article title extracted from the page source *content*.

    Raises:
        ValueError: if the title markers are missing from *content*.
    """
    return _text_between(content, _TITLE_BEGIN, _TITLE_END)


def localArticleDescription(content):
    """Return the article description extracted from the page source *content*.

    Raises:
        ValueError: if the description markers are missing from *content*.
    """
    return _text_between(content, _DESCRIPTION_BEGIN, _DESCRIPTION_END)


def localArticleImage(content):
    """Return the article image URL extracted from the page source *content*.

    Raises:
        ValueError: if the image markers are missing from *content*.
    """
    return _text_between(content, _IMAGE_BEGIN, _IMAGE_END)


def article(url):
    """Download *url* and return a simplified page containing only the article.

    The page is fetched with ``requests``, the title, description and image
    URL are extracted, the article body is cut out between the configured
    markers, cleaned with ``_CLEANUP_SUBSTITUTIONS`` and wrapped in the page
    template.  The raw article body is also written to ``titi.html`` in the
    current directory (debug dump).

    ``say`` and ``debug`` are expected to come from the ``userio`` star import
    -- NOTE(review): confirm.

    Raises:
        ValueError: if a required marker is not found in the page.
        requests.RequestException: if the download fails or times out.
    """
    say("Article: "+url)
    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(
        url, allow_redirects=True, timeout=_REQUEST_TIMEOUT_SECONDS
    )
    response.encoding = response.apparent_encoding
    content = response.text

    image_url = localArticleImage(content)
    title = localArticleTitle(content)
    description = localArticleDescription(content)
    image_url = re.sub(_IMAGE_PROXY_PATTERN, r"\g<1>", image_url)

    index_begin = _index_of(content, _ARTICLE_BEGIN)
    # Only an end marker located after the start of the body is meaningful.
    index_end = _first_index_of(content, _ARTICLE_END_CANDIDATES, index_begin)
    debug("indexBegin: "+str(index_begin))
    debug("indexEnd : "+str(index_end))
    say("Title: "+title)
    say("Image: "+image_url)

    raw_body = content[index_begin:index_end]
    # Debug dump of the untouched body; explicit encoding so that non-ASCII
    # text does not depend on the platform default.
    with open("titi.html", "w", encoding="utf-8") as dump:
        dump.write(raw_body)

    article_only = _TITLE_OPEN + title + _TITLE_CLOSE + description + raw_body
    for pattern, replacement in _CLEANUP_SUBSTITUTIONS:
        article_only = re.sub(pattern, replacement, article_only)
    # One tag per line keeps the generated page readable.
    article_only = article_only.replace("><", ">\n<")

    return _PAGE_HEAD + _PAGE_BODY_OPEN + article_only + _PAGE_BODY_CLOSE