#!/usr/bin/env python3
# encoding: UTF-8
__author__ = 'Yanik Cawidrone'
__version__ = '0.1'
"""
For more see the file 'LICENSE' for copying permission.
"""
from .newsParser import newsDNA
from .newsParser import newsFt
from .newsParser import newsLeParisien
from .newsParser import newsLiberation
from .newsParser import newsWaPo
from .newsParser import newsZDNetFr
from .newsParser import newsSCMP
from .newsParser import newsTelerama
from .newsParser import newsCNA
from .newsParser import newsViceCom
from .newsParser import newsNewYorkTimes
from .newsParser import newsMothershipSG
from .newsParser import newsLeMonde
from .newsParser import newsChallengesFr
from .newsParser import newsJDD
from .newsParser import newsMidiLibre
from .newsParser import newsNouvelObs
from .newsParser import newsHuffPost
from .newsParser import newsStraitsTimes
from .newsParser import newsNewYorker
from .newsParser import newsLeFigaro
from .newsParser import newsSudOuest
from .newsParser import newsBBC
from .newsParser import newsTheAtlantic
from .newsParser import newsTheStarMy
from .newsParser import newsNSTMy
from .newsParser import newsLaDepeche
from .newsParser import newsTheGuardian
from .newsParser import newsBloomberg
from .newsParser import newsFranceTVInfo
from .newsParser import newsTheVerge
from .newsParser import newsBondyBlog
from .newsParser import newsFrandroidCom
from .newsParser import newsBuzzfeedCom
from .newsParser import newsYahooCom
from .newsParser import newsBFM
from .newsParser import newsDefault
from .newsParser import newsLNC
from .newsParser import newsSlateCom
# ~ from .newsParser import newsTodayOnlineSG
def supportedList():
    """Return an HTML <ul> listing the supported news sites.

    The list is derived from this module's own namespace: every attribute
    whose name contains "news" (minus dunder names and the "newsParser"
    submodule itself) is treated as a site parser, and its "news" prefix
    is stripped for display.
    """
    module = __import__(__name__)
    site_names = [
        attr for attr in dir(module)
        if "__" not in attr and "news" in attr and "newsParser" not in attr
    ]
    items = "".join(
        "<li>" + attr.replace("news", "") + "</li>\n" for attr in site_names
    )
    return "<ul>\n" + items + "</ul>\n"
def articleElement(typeElement,content):
    """Extract one og-style metadata element from a page's raw HTML.

    Parameters:
        typeElement: which element to extract — "title", "description"
            or "image" (matches the og:<typeElement> meta property).
        content: the raw HTML (or embedded JSON-LD) of the article page.

    Returns the extracted string, or "" when the element is not found.

    Site-specific markers are tried first (Buzzfeed JSON-LD, Le Monde,
    NYT data-rh meta tags), then a generic <meta property="og:..."> scan.

    BUG FIXES vs. the original:
      * string comparisons used `is` instead of `==` (identity of string
        literals is implementation-dependent);
      * in the generic branch, when the `data-rh` begin-marker matched,
        the slice offset still used the length of the *shorter* marker,
        corrupting the result;
      * the Buzzfeed branch raised NameError for typeElement "image";
        it now returns "".
    """
    element = ""
    if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
        # Buzzfeed embeds metadata as JSON-LD keys rather than og: tags.
        if typeElement == "title":
            articleElementBegin = "\"headline\": \""
        elif typeElement == "description":
            articleElementBegin = "\"description\": \""
        else:
            # No known JSON-LD key for this element type (e.g. "image").
            return element
        articleElementEnd = "\","
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
        element = content[indexElementBegin + len(articleElementBegin):indexElementEnd]
    elif "<meta property=\"og:url\" content=\"https://www.lemonde.fr/" in content:
        articleElementEnd = "\">"
        if typeElement == "image":
            # The "http" prefix is part of the marker, so re-prepend it.
            articleElementBegin = "<meta property=\"og:image\" content=\"http"
            indexElementBegin = content.index(articleElementBegin)
            indexElementEnd = content.index(articleElementEnd, indexElementBegin)
            element = "http" + content[indexElementBegin + len(articleElementBegin):indexElementEnd]
        elif typeElement == "title":
            articleElementBegin = "<meta property=\"og:title\" content=\""
            indexElementBegin = content.index(articleElementBegin)
            indexElementEnd = content.index(articleElementEnd, indexElementBegin)
            element = content[indexElementBegin + len(articleElementBegin):indexElementEnd]
        # Other element types fall through and return "".
    elif "\"nytimes.com\"" in content:
        # NYT serves its og: tags with a data-rh attribute and no space
        # before the self-closing slash.
        articleElementBegin = "<meta data-rh=\"true\" property=\"og:" + typeElement + "\" content=\""
        articleElementEnd = "\"/>"
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
        element = content[indexElementBegin + len(articleElementBegin):indexElementEnd]
    else:
        # Generic extraction: plain og: meta tag, or the data-rh variant.
        articleElementBegin = "<meta property=\"og:" + typeElement + "\" content=\""
        articleElementBegin2 = "<meta data-rh=\"true\" property=\"og:" + typeElement + "\" content=\""
        try:
            indexElementBegin = content.index(articleElementBegin)
            beginLen = len(articleElementBegin)
        except ValueError:
            try:
                indexElementBegin = content.index(articleElementBegin2)
                # BUG FIX: use the matched marker's own length.
                beginLen = len(articleElementBegin2)
            except ValueError:
                # Neither marker found: keep the original fallback of
                # scanning from position 0 with the first marker's length.
                indexElementBegin = 0
                beginLen = len(articleElementBegin)
        # Try the known closing variants in the original priority order:
        # '" />' first, then '"/>' and finally '">'.
        for articleElementEnd in ("\" />", "\"/>", "\">"):
            try:
                indexElementEnd = content.index(articleElementEnd, indexElementBegin)
                break
            except ValueError:
                continue
        else:
            # No closing marker at all: give up gracefully.
            return element
        element = content[indexElementBegin + beginLen:indexElementEnd]
    return element
def articleTitle(content):
    """Return the article title extracted from the raw page *content*."""
    print("newsParser.articleTitle")
    title = articleElement("title", content)
    return title
def articleImage(content):
    """Return the article image URL extracted from the raw page *content*."""
    print("newsParser.articleImage")
    image = articleElement("image", content)
    return image
def articleDescription(content):
    """Return the article description extracted from the raw page *content*."""
    print("newsParser.articleDescription")
    description = articleElement("description", content)
    return description
def getArticle(url):
    """Render the article at *url* as an HTML fragment.

    The URL is matched against a list of known news-site substrings and
    the first matching site-specific parser module handles it.  Unknown
    sites get a generic "open original link" paragraph followed by the
    default extractor's output.  Returns "" when *url* is None.

    BUG FIX: every branch except "ft.com" referenced its parser through
    an undefined name (``newsParser.newsDNA`` etc.), although the
    top-of-file imports bind the bare names (``from .newsParser import
    newsDNA``) — so every non-ft URL raised NameError at runtime.  All
    parsers are now referenced by their bare imported names.
    """
    if url is None:
        return ""
    # Dispatch table: first matching substring wins; order follows the
    # original if/elif chain.  Built inside the function so the
    # url-is-None fast path touches no parser module.
    # NOTE(review): newsNSTMy is imported at the top of the file but has
    # no entry here — confirm whether an "nst.com.my" rule is missing.
    dispatch = (
        ("ft.com", newsFt),
        ("dna.fr", newsDNA),
        ("washingtonpost.com", newsWaPo),
        ("leparisien.fr", newsLeParisien),
        ("liberation.fr", newsLiberation),
        ("zdnet.fr", newsZDNetFr),
        ("scmp.com", newsSCMP),
        ("telerama.fr", newsTelerama),
        ("channelnewsasia.com", newsCNA),
        ("vice.com", newsViceCom),
        ("nytimes.com", newsNewYorkTimes),
        ("mothership.sg", newsMothershipSG),
        ("lemonde.fr", newsLeMonde),
        ("lejdd.fr", newsJDD),
        ("nouvelobs.com", newsNouvelObs),
        ("huffingtonpost.", newsHuffPost),
        ("huffpost.com", newsHuffPost),
        ("straitstimes.com", newsStraitsTimes),
        ("newyorker.com", newsNewYorker),
        ("lefigaro.fr", newsLeFigaro),
        ("sudouest.fr", newsSudOuest),
        ("bbc.com", newsBBC),
        ("theatlantic.com", newsTheAtlantic),
        ("thestar.com.my", newsTheStarMy),
        ("challenges.fr", newsChallengesFr),
        ("depeche.fr", newsLaDepeche),
        ("guardian.com", newsTheGuardian),
        ("guardian.co.uk", newsTheGuardian),
        ("bloomberg.com", newsBloomberg),
        ("francetvinfo.fr", newsFranceTVInfo),
        ("theverge.com", newsTheVerge),
        ("bondyblog.fr", newsBondyBlog),
        ("frandroid.com", newsFrandroidCom),
        ("buzzfeed.com", newsBuzzfeedCom),
        ("buzzfeednews.com", newsBuzzfeedCom),
        ("news.yahoo.com", newsYahooCom),
        ("afp.com", newsYahooCom),
        ("bfmtv.com", newsBFM),
        ("lnc.nc", newsLNC),
        ("slate.com", newsSlateCom),
    )
    for token, parser in dispatch:
        if token in url:
            return parser.article(url)
    # Unknown site: generic extraction with a link back to the original.
    data_page = "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
    return data_page + newsDefault.article(url)