newsProxy/newsParser/newsParser/newsFrandroidCom.py at 1ffd4577ca42d6aa0c24342e01bc488a66a786bd ・ ycawidro/newsProxy ・ Gitprep

newsProxy / newsParser / newsParser / newsFrandroidCom.py /

1 contributor

48 lines | 1.9kb

from userio import *
import requests
import re
import newsParser
  
def article(url, timeout=30):
  """Fetch a frandroid.com article and return a simplified HTML fragment.

  The page is downloaded with ``requests`` (following redirects), its title,
  description and image URL are obtained through the ``newsParser`` helpers,
  and the article body is cut out of the raw HTML between a fixed begin
  marker and the first end marker that can be found after it.

  The returned string consists of Open Graph ``<meta>`` tags followed by the
  title heading and the article body, in which:

  * ``<amp-img>`` elements are turned into plain ``<img>`` tags,
  * headings are demoted by one level (``h2`` -> ``h3``, then ``h1`` -> ``h2``),
  * a newline is inserted between directly adjacent tags,
  * root-relative links (``href="/..."``) are pointed at ``www.frandroid.com``.

  Args:
    url: Address of the article page to fetch.
    timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument,
      so that an unresponsive server cannot block the caller forever.

  Returns:
    The assembled HTML fragment as a string.

  Raises:
    ValueError: If the begin marker, or every end marker, is missing from
      the downloaded page.

  Any exception raised by ``requests.get`` propagates to the caller.
  """
  say("Article: "+url)
  r = requests.get(url, allow_redirects=True, timeout=timeout)
  content = r.text
  articleCstBegin = "<div class=\"article-content"
  # Candidate end markers, tried in order of preference.
  articleCstEnds = (
    " <p class=\"title\">",
    "<div class=\"article-footer",
    "</article>",
  )
  articleStrTitle = newsParser.articleTitle(content)
  articleStrDescription = newsParser.articleDescription(content)
  articleStrImageUrl = newsParser.articleImage(content)

  # Open Graph header; the last tag intentionally has no trailing newline so
  # the output layout stays the same as before.
  pageContent = "".join([
    "<meta property=\"og:type\" content=\"article\">\n",
    "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
    "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
    "<meta property=\"og:url\" content=\""+url+"\">\n",
    "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
    "<meta property=\"og:image:type\" content=\"image/jpeg\">",
  ])

  # str.index raises ValueError when the begin marker is absent.
  indexBegin = content.index(articleCstBegin)

  # Only look for end markers *after* the article start: an occurrence earlier
  # in the page would otherwise produce an empty or negative slice.
  for endMarker in articleCstEnds:
    indexEnd = content.find(endMarker, indexBegin)
    if indexEnd != -1:
      break
  else:
    raise ValueError("no article end marker found in "+url)

  article_only = "<h2>"+articleStrTitle+"</h2>\n"
  article_only += content[indexBegin:indexEnd]

  # Literal tag rewrites, applied in this exact order (h2 must be demoted
  # before h1 is, otherwise former h1 headings would be demoted twice).
  # NOTE: the title heading prepended above is also turned into <h3> here,
  # which matches the previous output.
  tagRewrites = (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
    ("><", ">\n<"),
  )
  for old, new in tagRewrites:
    article_only = article_only.replace(old, new)

  # Make root-relative links absolute. The negative lookahead leaves
  # protocol-relative links (href="//host/...") untouched instead of
  # prefixing them with the site host a second time.
  article_only = re.sub(r'href="/(?!/)', 'href="//www.frandroid.com/', article_only)
  pageContent += article_only
  return pageContent