from userio import *
import requests
import re
import newsParser
def localArticleTitle(content):
    """Return the part of *content* enclosed by the title markers.

    The result starts right after the first occurrence of the start marker
    and stops just before the first occurrence of ``articleElementEnd``
    found at or after that start marker.

    Args:
        content: Page source to search (must support ``index`` and slicing).

    Returns:
        The text between the two markers.

    Raises:
        ValueError: Propagated from ``index`` when a marker is not found.
    """
    # NOTE(review): the start marker is an empty string in this file, so the
    # lookup below always matches at offset 0 -- confirm the intended literal.
    opening_marker = ""
    marker_pos = content.index(opening_marker)
    # NOTE(review): articleElementEnd is not assigned inside this function;
    # it has to resolve at module scope -- confirm where it is defined.
    closing_pos = content.index(articleElementEnd, marker_pos)
    payload_start = marker_pos + len(opening_marker)
    return content[payload_start:closing_pos]
def localArticleDescription(content):
    """Return the part of *content* enclosed by the description markers.

    The result starts right after the first occurrence of the start marker
    and stops just before the first occurrence of ``articleElementEnd``
    found at or after that start marker.

    The original body ended with a ``return ""`` placed after an
    unconditional ``return``, so it could never run; a missing marker raised
    ``ValueError`` instead. That fallback is now reachable.

    Args:
        content: Page source to search (must support ``index`` and slicing).

    Returns:
        The text between the two markers, or ``""`` when either marker is
        not present in *content*.
    """
    # NOTE(review): the start marker is an empty string in this file, so the
    # lookup below always matches at offset 0 -- confirm the intended literal.
    articleElementBegin = ""
    try:
        indexElementBegin = content.index(articleElementBegin)
        # NOTE(review): articleElementEnd is not assigned inside this
        # function; it has to resolve at module scope -- confirm where it is
        # defined.
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # str.index signals "not found" with ValueError; treat a page without
        # the markers as having no description rather than failing.
        return ""
    return content[indexElementBegin + len(articleElementBegin):indexElementEnd]
def localArticleImage(content):
    """Return the part of *content* enclosed by the image markers.

    The result starts right after the first occurrence of the start marker
    and stops just before the first occurrence of ``articleElementEnd``
    found at or after that start marker.

    The original body ended with a ``return ""`` placed after an
    unconditional ``return``, so it could never run; a missing marker raised
    ``ValueError`` instead. That fallback is now reachable.

    Args:
        content: Page source to search (must support ``index`` and slicing).

    Returns:
        The text between the two markers, or ``""`` when either marker is
        not present in *content*.
    """
    # NOTE(review): the start marker is an empty string in this file, so the
    # lookup below always matches at offset 0 -- confirm the intended literal.
    articleElementBegin = ""
    try:
        indexElementBegin = content.index(articleElementBegin)
        # NOTE(review): articleElementEnd is not assigned inside this
        # function; it has to resolve at module scope -- confirm where it is
        # defined.
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # str.index signals "not found" with ValueError; treat a page without
        # the markers as having no image rather than failing.
        return ""
    return content[indexElementBegin + len(articleElementBegin):indexElementEnd]
def article(url):
say("Article: "+url)
r = requests.get(url, allow_redirects=True)
r.encoding = r.apparent_encoding
content = r.text
articleStrImageUrl = localArticleImage(content)
articleStrTitle = localArticleTitle(content)
articleStrDescription = localArticleDescription(content)
articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)
pageContent = ""
pageContent += ""
pageContent += "\n"
pageContent += "\n"
pageContent += "\n"
pageContent += "\n"
pageContent += "\n"
pageContent += ""
articleCstBegin = ""
articleCstEnd2 = ""
articleCstEnd3 = "
"
indexBegin = content.index(articleCstBegin)
try:
indexEnd = content.index(articleCstEnd)
except:
try:
indexEnd = content.index(articleCstEnd2)
except:
indexEnd = content.index(articleCstEnd3)
debug("indexBegin: "+str(indexBegin))
debug("indexEnd : "+str(indexEnd))
say("Title: "+articleStrTitle)
say("Image: "+articleStrImageUrl)
article_only = "
"+articleStrTitle+"
"
article_only += "

"
article_only += "
"+articleStrDescription+""
with open("titi.html", "w") as f2:
f2.write(content[indexBegin:indexEnd])
f2.close
article_only += content[indexBegin:indexEnd]
article_only = re.sub(r"
", '', article_only)
article_only = re.sub(r"", '', article_only)
article_only = re.sub(r"", '
', article_only)
# ~ article_only = re.sub(r"Advertisement
", '', article_only)
article_only = re.sub(r"Advertisement
", '', article_only)
#article_only = re.sub(r"Story continues below advertisement
", '', article_only)
article_only = re.sub(r"Story continues below advertisement
","", article_only)
article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only)
article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only)
article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only)
article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only)
article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only)
article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only)
article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only)
article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)
article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)
article_only = re.sub(r"", '
', article_only)
article_only = re.sub(r"filter:blur\(10px\);", '', article_only)
article_only = re.sub(r"
", '
', article_only)
article_only = re.sub(r"
", '
', article_only)
article_only = re.sub(r"

", '', article_only)
article_only = re.sub(r"
", '
', article_only)
#article_only = re.sub(r"