#!/usr/bin/env python3
# encoding: UTF-8
"""Dispatch article URLs to site-specific parsers and extract page metadata.

For more see the file 'LICENSE' for copying permission.
"""

__author__ = 'Yanik Cawidrone'
__version__ = '0.1'

import html

from .newsParser import newsBBC
from .newsParser import newsBFM
from .newsParser import newsBondyBlog
from .newsParser import newsBuzzfeedCom
from .newsParser import newsChallengesFr
from .newsParser import newsCNA
from .newsParser import newsCourrier
from .newsParser import newsDefault
from .newsParser import newsDNA
from .newsParser import newsFranceTVInfo
from .newsParser import newsFrandroidCom
from .newsParser import newsHuffPost
from .newsParser import newsJDD
from .newsParser import newsLaDepeche
from .newsParser import newsLeFigaro
from .newsParser import newsLeMonde
from .newsParser import newsLeParisien
from .newsParser import newsLiberation
from .newsParser import newsMediapart
from .newsParser import newsMidiLibre
from .newsParser import newsMothershipSG
from .newsParser import newsNewYorker
from .newsParser import newsNouvelObs
from .newsParser import newsNSTMy
from .newsParser import newsSCMP
from .newsParser import newsSlateCom
from .newsParser import newsSlateFr
from .newsParser import newsStraitsTimes
from .newsParser import newsSudOuest
from .newsParser import newsTelerama
from .newsParser import newsTheAtlantic
from .newsParser import newsTheGuardian
from .newsParser import newsTheStarMy
from .newsParser import newsTheVerge
from .newsParser import newsViceCom
from .newsParser import newsWaPo
from .newsParser import newsYahooCom
from .newsParser import newsZDNetFr
# ~ from .newsParser import newsXXXXXX

from .newsParser import accountMediapart
from .newsParser import accountCourrier


# Ordered dispatch table: (substrings searched in the URL, parser module name).
# Order is significant because matching is a plain substring test and the
# first usable entry wins; it mirrors the original if/elif chain.
_SITE_PARSERS = (
    (("ft.com",), "newsFt"),
    (("dna.fr",), "newsDNA"),
    (("washingtonpost.com",), "newsWaPo"),
    (("leparisien.fr",), "newsLeParisien"),
    (("liberation.fr",), "newsLiberation"),
    (("zdnet.fr",), "newsZDNetFr"),
    (("scmp.com",), "newsSCMP"),
    (("telerama.fr",), "newsTelerama"),
    (("channelnewsasia.com",), "newsCNA"),
    (("vice.com",), "newsViceCom"),
    (("nytimes.com",), "newsNewYorkTimes"),
    (("mothership.sg",), "newsMothershipSG"),
    (("lemonde.fr",), "newsLeMonde"),
    (("lejdd.fr",), "newsJDD"),
    (("nouvelobs.com",), "newsNouvelObs"),
    (("huffingtonpost.",), "newsHuffPost"),
    (("huffpost.com",), "newsHuffPost"),
    (("straitstimes.com",), "newsStraitsTimes"),
    (("newyorker.com",), "newsNewYorker"),
    (("lefigaro.fr",), "newsLeFigaro"),
    (("sudouest.fr",), "newsSudOuest"),
    (("bbc.com",), "newsBBC"),
    (("theatlantic.com",), "newsTheAtlantic"),
    (("thestar.com.my",), "newsTheStarMy"),
    (("challenges.fr",), "newsChallengesFr"),
    (("depeche.fr",), "newsLaDepeche"),
    (("guardian.com", "guardian.co.uk"), "newsTheGuardian"),
    (("francetvinfo.fr",), "newsFranceTVInfo"),
    (("theverge.com",), "newsTheVerge"),
    (("bondyblog.fr",), "newsBondyBlog"),
    (("frandroid.com",), "newsFrandroidCom"),
    (("buzzfeed.com", "buzzfeednews.com"), "newsBuzzfeedCom"),
    (("news.yahoo.com", "afp.com"), "newsYahooCom"),
    (("bfmtv.com",), "newsBFM"),
    (("lnc.nc",), "newsLNC"),
    (("slate.com",), "newsSlateCom"),
    (("slate.fr",), "newsSlateFr"),
    (("mediapart.fr",), "newsMediapart"),
    (("courrierinternational.com",), "newsCourrier"),
)

# Substring that identifies a Buzzfeed page carrying JSON-LD metadata.
_BUZZFEED_MARKER = '"mainEntityOfPage": "https://www.buzzfeed'

# JSON-LD key holding each element type on Buzzfeed pages. There is no entry
# for "image", so that type uses the generic extraction below.
_JSON_LD_KEYS = {"title": "headline", "description": "description"}

# NOTE(review): the begin markers of the generic branch were lost when markup
# was stripped from this file. They are reconstructed here as Open Graph meta
# tags because the element types (title / image / description) and the
# surviving end markers match that layout -- confirm against the upstream file.
_OG_META_TEMPLATES = (
    '<meta property="og:{}" content="',
    '<meta name="og:{}" content="',
)

# Possible terminators of the meta tag's content attribute. The second and
# third survive in the original source; the first is a reconstruction.
_META_END_MARKERS = ('" />', '"/>', '">')


def _resolve_parser(module_name):
    """Return the parser module called *module_name*, or None if unavailable.

    Names imported at the top of this file are looked up first. Otherwise the
    ``newsParser`` sub-package is consulted when it is bound in this module's
    namespace, which is how the original code reached its parsers.
    """
    namespace = globals()
    parser = namespace.get(module_name)
    if parser is None:
        parser = getattr(namespace.get("newsParser"), module_name, None)
    return parser


def _slice_between(content, begin_marker, end_markers):
    """Return the text after *begin_marker* up to the nearest end marker.

    Returns None when the begin marker, or every end marker, is absent.
    """
    begin = content.find(begin_marker)
    if begin < 0:
        return None
    start = begin + len(begin_marker)
    positions = [content.find(marker, start) for marker in end_markers]
    positions = [position for position in positions if position >= 0]
    if not positions:
        return None
    # Nearest terminator wins, so the closing of a later tag is never used.
    return content[start:min(positions)]


def supportedList():
    """Return an HTML list of the parser modules that can be resolved.

    Returns:
        str: a ``<ul>`` block with one ``<li>`` per available parser module,
        sorted by name.
    """
    # NOTE(review): the original list-building code was lost together with its
    # markup (only ``newsList = "\n"`` survives). This output format is a
    # reconstruction -- confirm against the upstream file.
    names = []
    for _needles, module_name in _SITE_PARSERS:
        if module_name not in names and _resolve_parser(module_name) is not None:
            names.append(module_name)
    items = "".join("<li>" + name + "</li>\n" for name in sorted(names))
    return "<ul>\n" + items + "</ul>\n"


def articleElement(typeElement, content):
    """Extract one metadata element from the raw HTML of an article page.

    Args:
        typeElement: "title", "image" or "description".
        content: the page source as a string.

    Returns:
        str: the extracted value, or "" when it cannot be located.
    """
    if not content:
        return ""

    # Buzzfeed pages expose title and description through JSON-LD.
    if _BUZZFEED_MARKER in content and typeElement in _JSON_LD_KEYS:
        begin_marker = '"' + _JSON_LD_KEYS[typeElement] + '": "'
        found = _slice_between(content, begin_marker, ('",',))
        if found is not None:
            return found

    # NOTE(review): the original had a second site-specific branch here whose
    # detection string and markers were destroyed by markup stripping. It
    # cannot be reconstructed from this file and must be restored upstream.

    # Generic extraction from the page's meta tags.
    for template in _OG_META_TEMPLATES:
        found = _slice_between(
            content, template.format(typeElement), _META_END_MARKERS
        )
        if found is not None:
            return found
    return ""


def articleTitle(content):
    """Return the article title extracted from *content*."""
    print("newsParser.articleTitle")
    return articleElement("title", content)


def articleImage(content):
    """Return the article image URL extracted from *content*."""
    print("newsParser.articleImage")
    return articleElement("image", content)


def articleDescription(content):
    """Return the article description extracted from *content*."""
    print("newsParser.articleDescription")
    return articleElement("description", content)


def getArticle(url):
    """Return the rendered article for *url* using the matching site parser.

    The URL is compared against the dispatch table by substring. The first
    entry whose parser module is available handles the request; entries whose
    module cannot be resolved are skipped instead of raising. When nothing
    matches, a notice is emitted followed by the default parser's output.

    Args:
        url: the article URL, or None.

    Returns:
        str: the page produced by the parser, or "" when *url* is None.
    """
    if url is None:
        return ""

    for needles, module_name in _SITE_PARSERS:
        if any(needle in url for needle in needles):
            parser = _resolve_parser(module_name)
            if parser is not None:
                return "" + parser.article(url)

    # NOTE(review): the markup around this notice was stripped from the
    # source; the paragraph-and-link wrapper is a reconstruction based on the
    # surviving text -- confirm against the upstream file.
    # A "Supported News:" section built from supportedList() was commented
    # out in the original and stays disabled.
    notice = (
        '<p><a href="' + html.escape(url, quote=True) + '">'
        "Generic Extraction, click to open original link"
        "</a></p>\n"
    )
    return notice + newsDefault.article(url)