# 3b3c9f4 3 years ago
# 1 contributor
# 139 lines | 4.7kb
from userio import *
import requests
import re
import json

# Top-level body nodes whose type contains any of these substrings are skipped.
_SKIPPED_TYPE_MARKERS = (
    "ad1",
    "ad2",
    "ad3",
    "ad4",
    "ad5",
    "native-ads",
    "more-on-this",
)


def _extract_apollo_state(content):
    """Return the parsed ``window.__APOLLO_STATE__`` JSON embedded in *content*."""
    begin_marker = "window.__APOLLO_STATE__="
    end_marker = "</script><script>"
    start = content.index(begin_marker) + len(begin_marker)
    # Look for the terminator only after the payload begins: the same
    # "</script><script>" sequence may also occur earlier in the page.
    end = content.index(end_marker, start)
    return json.loads(content[start:end])


def _first_child_text(node):
    """Return the ``data`` string of *node*'s first child, or "" if there is none."""
    try:
        text = node["children"][0]["data"]
    except (KeyError, IndexError, TypeError):
        # Best effort: nodes without a usable first text child contribute nothing.
        return ""
    return text if isinstance(text, str) else ""


def _render_child(child, target):
    """Render one inline child node as an HTML fragment.

    *target* is the value used for the ``target`` attribute of generated links.
    Unknown node types are reported on stdout and rendered as an empty string.
    """
    kind = child["type"]
    if kind == "text":
        return child["data"]
    if kind == "a":
        href = child["attribs"]["href"]
        label = child["children"][0]["data"]
        return "<a href=\"" + href + "\" target=\"" + target + "\">" + label + "</a>"
    if kind == "img":
        src = child["attribs"]["src"]
        caption = child["attribs"]["title"]
        return (
            "<img src=\"" + src + "\">"
            + "<figcaption><em>" + caption + "</em></figcaption>"
            + _first_child_text(child)
        )
    if kind == "iframe":
        src = child["attribs"]["src"]
        caption = child["attribs"]["title"]
        return (
            "<iframe src=\"" + src + "\">"
            + _first_child_text(child)
            + "</iframe>"
            + "<figcaption><em><a href=\"" + src + "\" target=\"" + target + "\">"
            + caption + "</a></em></figcaption>"
        )
    if kind == "em":
        return "<em>" + _first_child_text(child) + "</em>"
    print("OTHER : " + kind)
    return ""


def _render_body(nodes):
    """Render the top-level body *nodes* as HTML, one line per rendered node."""
    parts = []
    rendered = 0  # counts rendered nodes only; used to build link targets
    for node in nodes:
        node_type = node["type"]
        if any(marker in node_type for marker in _SKIPPED_TYPE_MARKERS):
            continue
        try:
            children = node["children"]
        except KeyError:
            # Nodes without children have nothing to render.
            continue
        target = "new-" + str(rendered)
        parts.append("<" + node_type + ">")
        parts.extend(_render_child(child, target) for child in children)
        parts.append("</" + node_type + ">\n")
        rendered += 1
    return "".join(parts)


def article(url, timeout=30):
    """Fetch *url* and rebuild the article it contains as an HTML fragment.

    The page is expected to embed a ``window.__APOLLO_STATE__=`` JSON payload.
    The article node is located through the ``ROOT_QUERY`` entry whose key
    contains ``"applicationId":``; its headline, summary and body are then
    rendered as Open Graph ``<meta>`` tags followed by an ``<article>`` element.

    Side effects: announces the URL through ``say``, overwrites ``data.json``
    in the current directory with the parsed payload (presumably a debugging
    aid), and prints unknown inline node types to stdout.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The generated HTML as a string.

    Raises:
        ValueError: If the payload markers are missing or the JSON is invalid.
        KeyError: If the payload lacks the expected article structure.
    """
    say("Article: " + url)
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    state = _extract_apollo_state(response.text)

    with open('data.json', 'w') as f:
        json.dump(state, f)

    content_service = state["contentService"]

    # When several ROOT_QUERY keys match, the last one wins.
    article_key = None
    for key, value in content_service["ROOT_QUERY"].items():
        if "\"applicationId\":" in key:
            article_key = value["id"]
    if article_key is None:
        raise KeyError("no ROOT_QUERY entry carrying an applicationId was found")

    article_node = content_service[article_key]
    title = article_node["socialHeadline"]
    description = "".join(
        "<" + block["type"] + ">" + block["children"][0]["data"] + "</" + block["type"] + ">"
        for block in article_node["summary"]["json"]
    )

    # The body lives under a key that starts with a parameterised field name;
    # when several keys match, the last one wins.
    body_key = None
    for key in article_node:
        if "body({\"customContents\"" in key:
            body_key = key
    if body_key is None:
        raise KeyError("no body({\"customContents\"...}) entry found in the article node")

    # NOTE(review): title, description, url and node data are interpolated
    # without HTML escaping. Confirm whether the upstream values are already
    # entity-encoded before adding html.escape here.
    # TODO: emit an og:image meta tag once an image URL is extracted.
    head = [
        "<meta property=\"og:type\" content=\"article\">",
        "<meta property=\"og:title\" content=\"" + title + "\">",
        "<meta property=\"og:description\" content=\"" + description + "\">",
        "<meta property=\"og:url\" content=\"" + url + "\">",
        "<article>",
        "<h2>" + title + "</h2>\n",
        "<em>" + description + "</em>\n",
    ]
    body = _render_body(article_node[body_key]["json"])
    return "".join(head) + body + "</article>"