# 3b3c9f4 3 years ago
# 1 contributor
# 58 lines | 3.025kb
# The star import stays first so the explicit imports below take precedence
# over any same-named export from userio.
from userio import *
import html
import json
import re

import requests

import newsParser

# Markers that delimit the HTML-escaped JSON payload inside the page source.
_ARTICLE_JSON_BEGIN = '<article-component :article="'
_ARTICLE_JSON_END = '" :nid='

# Request headers sent with every article fetch.
_ARTICLE_REQUEST_HEADERS = {
  'Accept-Encoding': 'deflate',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}

# Ordered rewrites applied to the assembled article markup.
# Attribute values are matched with [^"]* / [^>]* instead of .*? so that a
# match can never run past the tag it started in and swallow real content.
_MARKUP_REWRITES = tuple(
  (re.compile(pattern), replacement)
  for pattern, replacement in (
    (r'<amp-img', '<img'),
    (r'</amp-img>', ''),
    (r'<h2', '<h3'),
    (r'</h2>', '</h3>'),
    (r'<h1', '<h2'),
    (r'</h1>', '</h2>'),
    (r'<a class="share[^"]*" data-social-name="[^"]*" href="[^"]*" target="_blank">', '<a href="">'),
    (r'<li class="[^"]* share-bar__item">', '<li>'),
    (r'<div class="share-bar share-bar--sticky yr-share">', '<div class="share-bar share-bar--sticky yr-share" style="display:none;">'),
    (r'<div class="[^"]* share-bar[^>]*>', '<div style="display:none;">'),
    (r'<div class="yr-share">', '<div style="display:none;">'),
  )
)

# Root-relative links only; the lookahead leaves protocol-relative
# links (href="//host/...") untouched.
_ROOT_RELATIVE_HREF = re.compile(r'href="/(?!/)')


def _extract_article_json(content):
  """Return the decoded JSON object embedded between the article markers.

  Raises ValueError when a marker is missing or the payload is not valid
  JSON (json.JSONDecodeError is a ValueError subclass).
  """
  start = content.find(_ARTICLE_JSON_BEGIN)
  if start < 0:
    raise ValueError("article-component markup not found in page")
  start += len(_ARTICLE_JSON_BEGIN)
  # Search for the end marker only after the payload start, so an earlier
  # occurrence elsewhere in the page cannot produce a bogus slice.
  end = content.find(_ARTICLE_JSON_END, start)
  if end < 0:
    raise ValueError("end of article-component payload not found in page")
  # The payload sits in an HTML attribute, so undo HTML escaping in full;
  # json.loads itself takes care of JSON escapes such as "\/".
  return json.loads(html.unescape(content[start:end]))


def _clean_article_markup(markup):
  """Apply the tag rewrites, tag line breaks and link rebasing to markup."""
  for pattern, replacement in _MARKUP_REWRITES:
    markup = pattern.sub(replacement, markup)
  markup = markup.replace("><", ">\n<")
  return _ROOT_RELATIVE_HREF.sub('href="//www.straitstimes.com/', markup)


def article(url, timeout=30):
  """Fetch an article page and return a simplified HTML rendition of it.

  The page is downloaded with requests, the JSON object embedded in its
  <article-component> tag is decoded, and the result is rebuilt as a set of
  Open Graph <meta> tags followed by an <article> element holding the
  title, the first image (when there is one), its caption and the body.

  Args:
    url: Address of the article page.
    timeout: Seconds passed to requests.get as its timeout.

  Returns:
    The generated HTML as a string.

  Raises:
    ValueError: the embedded JSON could not be located or parsed.
    KeyError: the JSON has no 'title' or 'body' entry.
    requests.RequestException: the download failed or timed out.
  """
  say("Article: "+url)
  response = requests.get(
    url,
    allow_redirects=True,
    headers=_ARTICLE_REQUEST_HEADERS,
    timeout=timeout,
  )
  content = response.text

  data = _extract_article_json(content)

  # Decoded values are escaped again before being placed into markup so a
  # quote or angle bracket cannot break the generated tags.
  # NOTE(review): assumes title and caption are plain text, not markup - confirm.
  title = html.escape(data['title'])
  images = data.get('field_article_images') or []
  first_image = images[0] if images else {}
  image_url = first_image.get('url')
  caption = first_image.get('caption')

  # NOTE(review): the description is inserted exactly as newsParser returns
  # it; whether it is already HTML-escaped is not visible from here.
  description = newsParser.articleDescription(content)

  head = [
    '<meta property="og:type" content="article">\n',
    '<meta property="og:title" content="' + title + '">\n',
    '<meta property="og:description" content="' + description + '">\n',
    '<meta property="og:url" content="' + html.escape(url) + '">\n',
  ]
  body = ["<h2>" + title + "</h2>\n"]
  if image_url:
    escaped_image_url = html.escape(image_url)
    head.append('<meta property="og:image" content="' + escaped_image_url + '">\n')
    head.append('<meta property="og:image:type" content="image/jpeg">')
    body.append('<img src="' + escaped_image_url + '">\n')
    if caption is not None:
      body.append("<em>" + html.escape(caption) + "</em>\n")
  body.append(data['body'])

  # NOTE(review): the heading demotion inside the clean-up also turns the
  # title's own <h2> into <h3>; kept as-is because existing consumers may
  # rely on that output.
  markup = _clean_article_markup("".join(body))
  return "".join(head) + "<article>" + markup + "</article>"