# 1 contributor
from userio import *
import requests
import re
import newsParser
def articleImage(content):
    """Return the value of the page's ``og:image`` Open Graph meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between the quotes of the tag's ``content`` attribute,
        exactly as written in the markup (HTML entities are not decoded).

    Raises:
        ValueError: if the ``og:image`` tag, or the quote closing its
            value, is not present in ``content``.
    """
    opening = '<meta property="og:image" content="'
    value_start = content.index(opening) + len(opening)
    # Stop at the quote that closes the attribute instead of the literal
    # '" />': this also handles '"/>', '">' and tags that carry further
    # attributes after ``content``.
    value_end = content.index('"', value_start)
    return content[value_start:value_end]
def articleTitle(content):
    """Return the value of the page's ``og:title`` Open Graph meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between the quotes of the tag's ``content`` attribute,
        exactly as written in the markup (HTML entities are not decoded).

    Raises:
        ValueError: if the ``og:title`` tag, or the quote closing its
            value, is not present in ``content``.
    """
    opening = '<meta property="og:title" content="'
    title_start = content.index(opening) + len(opening)
    # The value ends at the attribute's closing quote; matching the literal
    # '" />' would miss tags written as '"/>' or '">' and would swallow any
    # attribute that follows ``content``.
    title_end = content.index('"', title_start)
    return content[title_start:title_end]
def articleDescription(content):
    """Return the value of the page's ``og:description`` Open Graph meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between the quotes of the tag's ``content`` attribute,
        exactly as written in the markup (HTML entities are not decoded).

    Raises:
        ValueError: if the ``og:description`` tag, or the quote closing its
            value, is not present in ``content``.
    """
    opening = '<meta property="og:description" content="'
    description_start = content.index(opening) + len(opening)
    # End at the closing quote of the attribute rather than at '" />', so
    # that '"/>', '">' and extra trailing attributes do not break parsing.
    description_end = content.index('"', description_start)
    return content[description_start:description_end]
def _firstMarkerIndex(text, markers, start=0):
    """Return the position of the first marker, in priority order, in text.

    Markers are tried one after another; the first one that occurs at or
    after ``start`` wins, even if a later marker occurs earlier in the text.

    Raises:
        ValueError: if none of the markers occurs at or after ``start``.
    """
    for marker in markers:
        position = text.find(marker, start)
        if position != -1:
            return position
    raise ValueError("none of these markers were found: " + ", ".join(markers))


def article(url, timeout=30):
    """Download an article page and return a stripped-down HTML rendition.

    The page at ``url`` is fetched with a desktop Chrome User-Agent. Its
    Open Graph title, description and image are read with articleTitle,
    articleDescription and articleImage, and the article body is cut out
    between known start and end markers. Root-relative ``href``/``src``
    values are pointed at www.theverge.com.

    Args:
        url: Address of the article page; it is also emitted as ``og:url``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        A string made of Open Graph ``<meta>`` tags followed by one
        ``<article>`` element containing the heading, the description, the
        lead image and the extracted article body.

    Raises:
        ValueError: if an Open Graph tag is missing, or if no start marker
            or no end marker can be found in the page.
        requests.exceptions.RequestException: propagated from
            ``requests.get`` on network failure or timeout.
    """
    say("Article: "+url)
    r = requests.get(
        url,
        allow_redirects=True,
        headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'},
        timeout=timeout,
    )
    content = r.text

    # Markers delimiting the article body, in order of preference.
    # ('<section class="c-nextclick">' was once used as an end marker.)
    beginMarkers = (
        "<div class=\"c-entry-content \">",
        "<article",
    )
    endMarkers = (
        "<div class=\"u-hidden-text\" id=\"formatter-datter\"",
        "<section class=\"c-related-list\">",
        "</article",
    )

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    metaTags = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
        "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    indexBegin = _firstMarkerIndex(content, beginMarkers)
    # Look for the end only after the start: an end marker that also occurs
    # earlier in the page would otherwise yield an empty body.
    indexEnd = _firstMarkerIndex(content, endMarkers, indexBegin)

    article_only = "".join([
        "<h2>"+articleStrTitle+"</h2>\n",
        "<em>"+articleStrDescription+"</em>\n",
        "<img src=\""+articleStrImageUrl+"\">\n",
        content[indexBegin:indexEnd],
    ])

    # AMP image elements become plain <img> tags.
    article_only = article_only.replace("<amp-img", "<img")
    article_only = article_only.replace("</amp-img>", "")
    # Demote headings one level; h2 must be handled before h1 so that a
    # former h1 is not demoted twice.
    # NOTE(review): this also turns the <h2> title prepended above into an
    # <h3> -- kept as in the original; confirm whether that is intended.
    article_only = article_only.replace("<h2", "<h3")
    article_only = article_only.replace("</h2>", "</h3>")
    article_only = article_only.replace("<h1", "<h2")
    article_only = article_only.replace("</h1>", "</h2>")
    # Point root-relative links and sources at The Verge. The lookahead
    # leaves protocol-relative URLs ("//host/...") untouched instead of
    # prefixing them with a second host.
    article_only = re.sub(r'(href|src)="/(?!/)', r'\1="//www.theverge.com/', article_only)

    return metaTags+"<article>"+article_only+"</article>"