# 1 contributor
from userio import *
import requests
import re
import json
# Substrings of a body node's "type" that mark blocks to leave out of the
# rendered article (ad slots and "more on this" promos).
_SKIPPED_NODE_MARKERS = (
    "ad1",
    "ad2",
    "ad3",
    "ad4",
    "ad5",
    "native-ads",
    "more-on-this",
)

# Delimiters of the JSON state embedded in the page's inline script.
_STATE_BEGIN = "window.__APOLLO_STATE__="
_STATE_END = "</script><script>"


def _attr(value):
    """Escape *value* for use inside a double-quoted HTML attribute."""
    import html  # function-scope import keeps this block self-contained

    # NOTE(review): assumes the JSON carries decoded text (a literal `&`,
    # not `&amp;`) -- confirm against a live page.
    return html.escape(value, quote=True)


def _first_child_text(node):
    """Return the "data" of *node*'s first child, or "" if unavailable."""
    try:
        text = node["children"][0]["data"]
    except (KeyError, IndexError, TypeError):
        return ""
    # Non-string data is treated like missing data: nothing is emitted.
    return text if isinstance(text, str) else ""


def _extract_apollo_state(content):
    """Parse the JSON assigned to ``window.__APOLLO_STATE__`` in *content*."""
    start = content.index(_STATE_BEGIN) + len(_STATE_BEGIN)
    # Search for the terminator only *after* the state begins; an earlier
    # "</script><script>" in the page must not be mistaken for its end.
    end = content.index(_STATE_END, start)
    return json.loads(content[start:end])


def _find_article(state):
    """Return the article node referenced from the state's ROOT_QUERY."""
    service = state["contentService"]
    article_node = None
    for key, reference in service["ROOT_QUERY"].items():
        # When several keys match, the last one wins.
        if '"applicationId":' in key:
            article_node = service[reference["id"]]
    if article_node is None:
        raise KeyError("no ROOT_QUERY entry with an applicationId was found")
    return article_node


def _find_body(article_node):
    """Return the value stored under the article's ``body({...})`` key."""
    body_key = None
    for key in article_node:
        if 'body({"customContents"' in key:
            body_key = key
    if body_key is None:
        raise KeyError("article has no body({\"customContents\"...}) entry")
    return article_node[body_key]


def _render_summary(nodes):
    """Render each summary node as ``<type>first child text</type>``."""
    parts = []
    for node in nodes:
        tag = node["type"]
        text = node["children"][0]["data"]
        parts.append("<" + tag + ">" + text + "</" + tag + ">")
    return "".join(parts)


def _render_child(child, block_index):
    """Render one inline child node; *block_index* names the link target."""
    kind = child["type"]
    target = "new-" + str(block_index)

    if kind == "text":
        return child["data"]

    if kind == "a":
        href = _attr(child["attribs"]["href"])
        label = child["children"][0]["data"]
        return '<a href="' + href + '" target="' + target + '">' + label + "</a>"

    if kind == "img":
        src = _attr(child["attribs"]["src"])
        caption = child["attribs"]["title"]
        return (
            '<img src="' + src + '">'
            + "<figcaption><em>" + caption + "</em></figcaption>"
            + _first_child_text(child)
        )

    if kind == "iframe":
        src = _attr(child["attribs"]["src"])
        caption = child["attribs"]["title"]
        return (
            '<iframe src="' + src + '">'
            + _first_child_text(child)
            + "</iframe>"
            + '<figcaption><em><a href="' + src + '" target="' + target + '">'
            + caption
            + "</a></em></figcaption>"
        )

    if kind == "em":
        return "<em>" + _first_child_text(child) + "</em>"

    # Unknown inline types are reported and contribute no markup.
    print("OTHER : " + kind)
    return ""


def _render_body(nodes):
    """Render the body nodes, skipping ads and nodes without children."""
    parts = []
    rendered_blocks = 0  # counts emitted blocks only; used in link targets
    for node in nodes:
        tag = node["type"]
        if any(marker in tag for marker in _SKIPPED_NODE_MARKERS):
            continue
        try:
            children = node["children"]
        except KeyError:
            continue
        parts.append("<" + tag + ">")
        parts.extend(_render_child(child, rendered_blocks) for child in children)
        parts.append("</" + tag + ">\n")
        rendered_blocks += 1
    return "".join(parts)


def article(url, timeout=30):
    """Download an article page and rebuild it as a minimal HTML fragment.

    The page is fetched with ``requests``, the JSON assigned to
    ``window.__APOLLO_STATE__`` is parsed, and the article's headline,
    summary and body nodes are turned into Open Graph ``<meta>`` tags
    followed by an ``<article>`` element.

    Side effects: announces the URL through ``say`` and overwrites
    ``data.json`` in the current working directory with the parsed state.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Pass ``None`` to wait indefinitely.

    Returns:
        The generated HTML as a string.

    Raises:
        ValueError: The page lacks the embedded state markers, or the
            embedded state is not valid JSON.
        KeyError: The state lacks the expected article, body, or node keys.
        requests.RequestException: The download failed or timed out.
    """
    say("Article: " + url)
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    state = _extract_apollo_state(response.text)

    # Dump of the parsed state, rewritten on every call.
    with open('data.json', 'w') as f:
        json.dump(state, f)

    article_node = _find_article(state)
    title = article_node["socialHeadline"]
    description = _render_summary(article_node["summary"]["json"])
    body = _render_body(_find_body(article_node)["json"])

    # Attribute values are escaped so quotes in remote text cannot end the
    # attribute early. Element text is emitted verbatim.
    # NOTE(review): if the feed delivers raw `<` in text nodes, those need
    # escaping as well -- confirm against real data.
    return "".join(
        [
            '<meta property="og:type" content="article">',
            '<meta property="og:title" content="' + _attr(title) + '">',
            '<meta property="og:description" content="' + _attr(description) + '">',
            '<meta property="og:url" content="' + _attr(url) + '">',
            "<article>",
            "<h2>" + title + "</h2>\n",
            "<em>" + description + "</em>\n",
            body,
            "</article>",
        ]
    )