# newsProxy / newsParser / newsParser / newsMothershipSG.py
# 3b3c9f4 3 years ago
# 1 contributor
# 57 lines | 2.959 kB
from userio import *
import requests
import re
import newsParser

# Markers that delimit the article body inside the downloaded page.
_ARTICLE_BEGIN = "<div class=\"main-item\" "
_ARTICLE_END = "<div class=\"social-share bottom\">"

# Seconds to wait on the remote server before giving up on the download.
_REQUEST_TIMEOUT = 30


def _meta_tags(url, title, description, image_url):
  """Return the Open Graph <meta> lines that head the generated page."""
  from html import escape

  # `url` is the raw caller-supplied address, so it is escaped before being
  # placed in an attribute ('&' and '"' would otherwise break the markup).
  # NOTE(review): title/description/image_url are inserted verbatim, as
  # before; presumably newsParser returns attribute-safe text -- confirm.
  tags = [
    "<meta property=\"og:type\" content=\"article\">",
    "<meta property=\"og:title\" content=\"" + title + "\">",
    "<meta property=\"og:description\" content=\"" + description + "\">",
    "<meta property=\"og:url\" content=\"" + escape(url, quote=True) + "\">",
    "<meta property=\"og:image\" content=\"" + image_url + "\">",
    "<meta property=\"og:image:type\" content=\"image/jpeg\">",
  ]
  return "\n".join(tags)


def _demote_headings(markup):
  """Turn <h2> into <h3>, then <h1> into <h2>; the order is significant."""
  markup = markup.replace("<h2", "<h3").replace("</h2>", "</h3>")
  return markup.replace("<h1", "<h2").replace("</h1>", "</h2>")


def _clean_article_html(markup, image_url):
  """Strip AMP/sharing/ad widgets from `markup` and absolutise its links."""
  # Literal substitutions use str.replace: nothing here needs a regex, and
  # the image URL must not be interpreted as a regex replacement template.
  markup = markup.replace("<amp-img", "<img").replace("</amp-img>", "")
  markup = _demote_headings(markup)

  # Repeat the lead image immediately before the author link block.
  author_div = "<div class=\"article__author-link\">"
  markup = markup.replace(author_div, "<img src=\"" + image_url + "\">" + author_div)

  markup = markup.replace("<span class=\"advertisement__title\">Advertisement</span>", "")
  markup = markup.replace("class=\"picture__image lazyload\"", "")
  # Non-greedy so each share button is removed on its own; a greedy match
  # would also delete any content sitting between two buttons on one line.
  markup = re.sub(r"<a class=\"addthis_button(.*?)</a>", "", markup)
  sharing_div = "<div class=\"c-sharing--default is-article-top-position\""
  markup = markup.replace(sharing_div, sharing_div + " style=\"display:none\"")
  markup = markup.replace("<h3 class=\"save-for-later__title\">Bookmark</h3>", "")

  # NOTE(review): the demotion is deliberately run a second time to keep the
  # long-standing output, in which both <h1> and <h2> end up as <h3>.
  markup = _demote_headings(markup)
  markup = markup.replace("><", ">\n<")

  # Point root-relative links at the origin site.  The lookahead leaves
  # protocol-relative URLs ("//host/...") untouched, and the same
  # "//mothership.sg/" prefix is used for both quote styles.
  return re.sub(r"""(href|src)=(["'])/(?!/)""", r"\1=\2//mothership.sg/", markup)


def article(url):
  """Download a mothership.sg article and return a simplified HTML page.

  The result is a string made of Open Graph <meta> tags followed by an
  <article> element that holds the title, the lead image and the cleaned
  article body.

  Args:
    url: Address of the article page to download.

  Returns:
    The generated HTML as a string.

  Raises:
    ValueError: if the article begin/end markers are not found in the page.
    requests.exceptions.RequestException: if the download fails or exceeds
      the timeout.
  """
  say("Article: "+url)
  # A timeout keeps a stalled server from blocking the caller indefinitely.
  response = requests.get(url, allow_redirects=True, timeout=_REQUEST_TIMEOUT)
  content = response.text

  # Each value is extracted once and shared by the header and the body.
  image_url = newsParser.articleImage(content)
  title = newsParser.articleTitle(content)
  description = newsParser.articleDescription(content)

  begin = content.index(_ARTICLE_BEGIN)
  end = content.index(_ARTICLE_END, begin)

  article_only = "<h2>" + title + "</h2>\n"
  article_only += "<img src=\"" + image_url + "\">\n"
  article_only += content[begin:end]
  article_only = _clean_article_html(article_only, image_url)

  page_head = _meta_tags(url, title, description, image_url)
  return page_head + "<article>" + article_only + "</article>"