9e78dac 3 years ago
1 contributor
106 lines | 5.55kb
from userio import *
import requests
import re
import newsParser

def localArticleTitle(content):
  """Return the value of the page's ``og:title`` meta tag.

  The value is the text between the opening
  ``<meta property="og:title" content="`` marker and the first ``"/>``
  found at or after that marker.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The raw (still HTML-escaped) title text.

  Raises:
    ValueError: if the opening marker or the ``"/>`` terminator is absent.
  """
  opening = '<meta property="og:title" content="'
  closing = '"/>'
  tagStart = content.index(opening)
  # The terminator search starts at the tag itself, not after the marker.
  valueEnd = content.index(closing, tagStart)
  valueStart = tagStart + len(opening)
  return content[valueStart:valueEnd]

def localArticleDescription(content):
  """Return the value of the page's ``og:description`` meta tag.

  The value is the text between the opening
  ``<meta property="og:description" content="`` marker and the first
  ``"/>`` found at or after that marker.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The raw (still HTML-escaped) description text, or ``""`` when the
    page has no such tag or the tag is not terminated by ``"/>``.
  """
  opening = '<meta property="og:description" content="'
  closing = '"/>'
  try:
    tagStart = content.index(opening)
    valueEnd = content.index(closing, tagStart)
  except ValueError:
    # The original fallback ``return ""`` was unreachable; a missing tag
    # raised instead. The description is optional, so fall back to empty.
    return ""
  return content[tagStart + len(opening):valueEnd]

def localArticleImage(content):
  """Return the value of the page's ``og:image`` meta tag.

  The value is the text between the opening
  ``<meta property="og:image" content="`` marker and the first ``"/>``
  found at or after that marker.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The raw image URL as written in the tag, or ``""`` when the page has
    no such tag or the tag is not terminated by ``"/>``.
  """
  opening = '<meta property="og:image" content="'
  closing = '"/>'
  try:
    tagStart = content.index(opening)
    valueEnd = content.index(closing, tagStart)
  except ValueError:
    # The original fallback ``return ""`` was unreachable; a missing tag
    # raised instead. The image is optional, so fall back to empty.
    return ""
  return content[tagStart + len(opening):valueEnd]

# Ordered (pattern, replacement) pairs applied with re.sub to the rebuilt
# article markup. Order matters: <h2> is demoted to <h3> before <h1> is
# demoted to <h2>, so a demoted <h1> is not demoted a second time.
_ARTICLE_SUBSTITUTIONS = (
  # AMP image elements become plain <img> elements.
  (r"<amp-img", '<img'),
  (r"</amp-img>", ''),
  # Demote headings by one level.
  (r"<h2", '<h3'),
  (r"</h2>", '</h3>'),
  (r"<h1", '<h2'),
  (r"</h1>", '</h2>'),
  # Drop advertisement placeholders. The data-sc-v version number changes
  # between site releases, so any dotted version is accepted.
  (r"<div data-sc-v=\"[\d.]+\" data-sc-c=\"placeholder\">Advertisement</div>", ''),
  (r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>", ''),
  # Shrink fixed-size boxes to a single pixel.
  (r"style=\"width:300px;height:250px\"", 'style="width:1px;height:1px"'),
  (r"style=\"width:120px;height:32px\"", 'style="width:1px;height:1px"'),
  (r"style=\"width:136px;height:20px\"", 'style="width:1px;height:1px"'),
  (r"style=\"width:300px;height:600px\"", 'style="width:1px;height:1px"'),
  (r"style=\"min-height:250px\"", 'style="min-height:1px"'),
  (r"style=\"min-height:298px\"", 'style="min-height:1px"'),
  (r"style=\"min-height:600px\"", 'style="min-height:1px"'),
  # Strip site-specific presentation attributes and wrappers.
  (r"class=\"center absolute w-100\" style=\"top:-12px\"", ''),
  (r"<div data-qa=\"drop-cap-letter\">", '<div>'),
  (r"filter:blur\(10px\);", ''),
  (r"<div class=\"bg-pattern-1\".+?>", '<div>'),
  (r"<div class=\"bg-pattern-2\".+?>", '<div>'),
  (r"<img class=\"dn canvas-foreground\" src=\".+?\"/>", ''),
  (r"<div class=\"subhead .+?>", '<div>'),
)


def _articleBodyBounds(content):
  """Return (begin, end) indexes of the article body inside content."""
  indexBegin = content.index("<article")
  # End markers in priority order. Each is searched from the start of the
  # article so that an earlier occurrence in the page header is ignored.
  endMarkers = (
    "<div class=\"mt-md\">",
    "</article>",
    "<div class=\"flex mt-md\">",
  )
  for marker in endMarkers:
    indexEnd = content.find(marker, indexBegin)
    if indexEnd != -1:
      return indexBegin, indexEnd
  raise ValueError("article end marker not found")


def _cleanArticleMarkup(markup):
  """Apply every _ARTICLE_SUBSTITUTIONS rule, then put each tag on its own line."""
  for pattern, replacement in _ARTICLE_SUBSTITUTIONS:
    markup = re.sub(pattern, replacement, markup)
  return markup.replace("><", ">\n<")


def article(url, timeout=30):
  """Download an article page and return a simplified HTML page for it.

  The returned page consists of Open Graph meta tags (type, title,
  description, url, image) followed by an ``<article>`` element holding
  the title, the lead image, the description and the cleaned-up article
  body. As a side effect the raw extracted body is written to
  ``titi.html`` in the current directory.

  Args:
    url: Address of the article to fetch.
    timeout: Seconds passed to ``requests.get`` as its timeout, so that
      a stalled server cannot block the caller indefinitely.

  Returns:
    The simplified page as an HTML string.

  Raises:
    ValueError: if the page has no ``og:title`` tag, no ``<article``
      element, or none of the known end-of-article markers.
    requests.exceptions.RequestException: if the download fails.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True, timeout=timeout)
  r.encoding = r.apparent_encoding
  content = r.text

  articleStrImageUrl = localArticleImage(content)
  articleStrTitle = localArticleTitle(content)
  articleStrDescription = localArticleDescription(content)
  # Replace an imrs.php resizer URL by the image address in its src parameter.
  articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)

  pageContent = "".join([
    "<meta charset=\"utf-8\"/>",
    "<meta property=\"og:type\" content=\"article\">\n",
    "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
    "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
    "<meta property=\"og:url\" content=\""+url+"\">\n",
    "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
    "<meta property=\"og:image:type\" content=\"image/jpeg\">",
  ])

  indexBegin, indexEnd = _articleBodyBounds(content)
  debug("indexBegin: "+str(indexBegin))
  debug("indexEnd  : "+str(indexEnd))
  say("Title: "+articleStrTitle)
  say("Image: "+articleStrImageUrl)

  articleBody = content[indexBegin:indexEnd]

  # Dump of the raw extracted body. UTF-8 is set explicitly so the write
  # does not depend on the platform's default encoding.
  with open("titi.html", "w", encoding="utf-8") as f2:
    f2.write(articleBody)

  # The substitutions run over the heading, image and description added
  # here as well, so this <h2> title ends up as an <h3>.
  article_only = "<h2>"+articleStrTitle+"</h2>"
  article_only += "<img src=\""+articleStrImageUrl+"\">"
  article_only += "<em>"+articleStrDescription+"</em>"
  article_only += articleBody
  article_only = _cleanArticleMarkup(article_only)

  pageContent += "<article>"+article_only+"</article>"
  return pageContent