newsProxy/newsParser/newsParser/newsBloomberg.py at 283cedddf9c7b60af51d756d1f5c4793a5c80fba ・ ycawidro/newsProxy ・ Gitprep

newsProxy / newsParser / newsParser / newsBloomberg.py /
1 contributor
53 lines | 2.618kb
from userio import *
import requests
import re
import newsParser

def article(url, timeout=30):
  """Fetch a Bloomberg article page and return a stripped-down HTML fragment.

  The page is downloaded with a desktop-browser User-Agent, the article body
  is cut out of the page between known marker strings, cleaned up, and
  returned after a set of Open Graph <meta> tags.

  Args:
    url: Address of the article page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests waits indefinitely on a stalled connection.

  Returns:
    A string made of og:* <meta> tags followed by "<article>...</article>".

  Raises:
    requests.exceptions.RequestException: if the download fails or times out.
  """
  say("Article: "+url)
  response = requests.get(
    url,
    allow_redirects=True,
    timeout=timeout,
    headers={
      'Accept-Encoding': 'deflate',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    },
  )
  content = response.text

  articleStrImageUrl = newsParser.articleImage(content)
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)

  # NOTE(review): title, description and url are placed into attribute values
  # unescaped. Confirm whether the newsParser helpers already return
  # HTML-escaped text before adding html.escape() here (risk of double-escaping).
  metaTags = "".join([
    "<meta property=\"og:type\" content=\"article\">\n",
    "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
    "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
    "<meta property=\"og:url\" content=\""+url+"\">\n",
    "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
    "<meta property=\"og:image:type\" content=\"image/jpeg\">",
  ])

  article_only = "".join([
    "<h2>"+articleStrTitle+"</h2>\n",
    "<em>"+articleStrDescription+"</em>\n",
    "<img src=\""+articleStrImageUrl+"\">\n",
    _article_body(content),
  ])
  article_only = _clean_article_html(article_only)

  return metaTags+"<article>"+article_only+"</article>"


def _article_body(content):
  """Return the part of *content* between the article start and end markers."""
  startMarkers = (
    "<div class=\"article-content\">",
    "<time class=\"article-timestamp\"",
  )
  endMarker = "<div class=\"bottom-left-rail-touts-spacer\">"

  # Use the first start marker that is present; fall back to the page start.
  indexBegin = 0
  for marker in startMarkers:
    position = content.find(marker)
    if position != -1:
      indexBegin = position
      break

  # A missing end marker maps to 0, which makes the slice below empty.
  indexEnd = max(content.find(endMarker), 0)
  return content[indexBegin:indexEnd]


def _clean_article_html(fragment):
  """Strip ad placeholders, demote headings and normalise images and links."""
  fragment = fragment.replace("<div class=\"ac-w-ph__dsc\">Advertisement</div>", '')

  # Demote headings one level. h2 must be handled before h1 so that a demoted
  # h1 is not demoted a second time. This also affects the title heading that
  # article() puts at the top of the fragment.
  fragment = fragment.replace("<h2", '<h3')
  fragment = fragment.replace("</h2>", '</h3>')
  fragment = fragment.replace("<h1", '<h2')
  fragment = fragment.replace("</h1>", '</h2>')

  fragment = fragment.replace("<p>Advertisement</p>", '')

  # Collapse a <picture> element into a plain <img> that points at the first
  # srcSet URL with its query string removed.
  fragment = re.sub(
    r'<picture><source media="(.*?)" srcSet="(.*?)\?(.*?)</picture>',
    r'<img src="\g<2>">',
    fragment,
  )
  fragment = re.sub(
    r'<a href="#after-(.*?)" style="position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden">Continue reading the main story</a>',
    '',
    fragment,
  )

  # Put every tag on its own line.
  fragment = fragment.replace("><", ">\n<")

  # Point site-relative links at the Bloomberg host. The lookahead leaves
  # protocol-relative links (href="//host/...") untouched.
  fragment = re.sub(r'href="/(?!/)', 'href="//www.bloomberg.com/', fragment)
  return fragment