Added parsers ・ 3b3c9f4 ・ Gitprep

- Added parsers;
- Browse files
- ycawidro committed on 2021-10-05
- 1 parent a536863
  
  commit 3b3c9f4ff11553d7b77680a456d4400af84b2c8a

Showing 37 changed files with 2271 additions and 0 deletions

+213

newsParser/__init__.py

...	...	@@ -0,0 +1,213 @@
	1	+#!/usr/bin/env python3
	2	+# encoding: UTF-8
	3	+__author__ = 'Yanik Cawidrone'
	4	+__version__ = '0.1'
	5	+
	6	+"""
	7	+ For more see the file 'LICENSE' for copying permission.
	8	+"""
	9	+
	10	+from .newsParser import newsDNA
	11	+from .newsParser import newsFt
	12	+from .newsParser import newsLeParisien
	13	+from .newsParser import newsLiberation
	14	+from .newsParser import newsWaPo
	15	+from .newsParser import newsZDNetFr
	16	+from .newsParser import newsSCMP
	17	+from .newsParser import newsTelerama
	18	+from .newsParser import newsCNA
	19	+from .newsParser import newsViceCom
	20	+from .newsParser import newsNewYorkTimes
	21	+from .newsParser import newsMothershipSG
	22	+from .newsParser import newsLeMonde
	23	+from .newsParser import newsChallengesFr
	24	+from .newsParser import newsJDD
	25	+from .newsParser import newsMidiLibre
	26	+from .newsParser import newsNouvelObs
	27	+from .newsParser import newsHuffPost
	28	+from .newsParser import newsStraitsTimes
	29	+from .newsParser import newsNewYorker
	30	+from .newsParser import newsLeFigaro
	31	+from .newsParser import newsSudOuest
	32	+from .newsParser import newsBBC
	33	+from .newsParser import newsTheAtlantic
	34	+from .newsParser import newsTheStarMy
	35	+from .newsParser import newsNSTMy
	36	+from .newsParser import newsLaDepeche
	37	+from .newsParser import newsTheGuardian
	38	+from .newsParser import newsBloomberg
	39	+from .newsParser import newsFranceTVInfo
	40	+from .newsParser import newsTheVerge
	41	+from .newsParser import newsBondyBlog
	42	+from .newsParser import newsFrandroidCom
	43	+from .newsParser import newsBuzzfeedCom
	44	+from .newsParser import newsYahooCom
	45	+from .newsParser import newsBFM
	46	+# ~ from .newsParser import newsTodayOnlineSG
	47	+
	48	+def supportedList():
	49	+ current_module = __import__(__name__)
	50	+ current_content = dir(current_module)
	51	+ newsList = "<ul>\n"
	52	+ for funcName in current_content:
	53	+ if "__" not in funcName and "news" in funcName and "newsParser" not in funcName:
	54	+ #newsList += "<li>"+funcName+"</li>\n"
	55	+ newsList += "<li>"+funcName.replace("news","")+"</li>\n"
	56	+ newsList += "</ul>\n"
	57	+ return newsList
	58	+
	59	+def articleElement(typeElement,content):
	60	+ element=""
	61	+ if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
	62	+ #print("=================== Buzzfeed")
	63	+ if typeElement is "title":
	64	+ articleElementBegin ="\"headline\": \""
	65	+ elif typeElement is "description":
	66	+ articleElementBegin ="\"description\": \""
	67	+ articleElementEnd ="\","
	68	+ indexElementBegin = content.index(articleElementBegin)
	69	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	70	+ element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	71	+ elif "<meta property=\"og:url\" content=\"https://www.lemonde.fr/" in content:
	72	+ #print("=================== Lemonde")
	73	+ articleElementBegin=""
	74	+ articleElementEnd ="\">"
	75	+ if typeElement is "image":
	76	+ articleElementBegin ="<meta property=\"og:image\" content=\"http"
	77	+ indexElementBegin = content.index(articleElementBegin)
	78	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	79	+ element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	80	+ elif typeElement is "title":
	81	+ articleElementBegin ="<meta property=\"og:title\" content=\""
	82	+ indexElementBegin = content.index(articleElementBegin)
	83	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	84	+ element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	85	+ elif "\"nytimes.com\"" in content:
	86	+ #print("=================== NewYorkTimes")
	87	+ articleElementBegin ="<meta data-rh=\"true\" property=\"og:"+typeElement+"\" content=\""
	88	+ articleElementEnd ="\"/>"
	89	+ indexElementBegin = content.index(articleElementBegin)
	90	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	91	+ element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	92	+ else:
	93	+ #print("=================== Generic")
	94	+ articleElementBegin ="<meta property=\"og:"+typeElement+"\" content=\""
	95	+ articleElementBegin2 ="<meta data-rh=\"true\" property=\"og:"+typeElement+"\" content=\""
	96	+ articleElementEnd ="\" />"
	97	+ articleElementEnd2 ="\"/>"
	98	+ articleElementEnd3 ="\">"
	99	+ try:
	100	+ # ~ print("Begin Try: "+articleElementBegin)
	101	+ indexElementBegin = content.index(articleElementBegin)
	102	+ except:
	103	+ try:
	104	+ # ~ print("Begin Try: "+articleElementBegin2)
	105	+ indexElementBegin = content.index(articleElementBegin2)
	106	+ except:
	107	+ indexElementBegin = 0
	108	+ try:
	109	+ print("End Try: "+articleElementEnd)
	110	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	111	+ except:
	112	+ try:
	113	+ print("End Try: "+articleElementEnd2)
	114	+ indexElementEnd = content.index(articleElementEnd2,indexElementBegin)
	115	+ except:
	116	+ print("End Try: "+articleElementEnd3)
	117	+ indexElementEnd = content.index(articleElementEnd3,indexElementBegin)
	118	+ element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	119	+ #print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd))
	120	+ #print("Element["+element+"]")
	121	+ return element
	122	+
	123	+def articleTitle(content):
	124	+ print("newsParser.articleTitle")
	125	+ return articleElement("title",content)
	126	+
	127	+def articleImage(content):
	128	+ print("newsParser.articleImage")
	129	+ return articleElement("image",content)
	130	+
	131	+def articleDescription(content):
	132	+ print("newsParser.articleDescription")
	133	+ return articleElement("description",content)
	134	+
	135	+def getArticle(url):
	136	+ data_page = ""
	137	+ if not url is None:
	138	+ if "ft.com" in url:
	139	+ data_page += newsFt.article(url)
	140	+ elif "dna.fr" in url:
	141	+ data_page += newsParser.newsDNA.article(url)
	142	+ elif "washingtonpost.com" in url:
	143	+ data_page += newsParser.newsWaPo.article(url)
	144	+ elif "leparisien.fr" in url:
	145	+ data_page += newsParser.newsLeParisien.article(url)
	146	+ elif "liberation.fr" in url:
	147	+ data_page += newsParser.newsLiberation.article(url)
	148	+ elif "zdnet.fr" in url:
	149	+ data_page += newsParser.newsZDNetFr.article(url)
	150	+ elif "scmp.com" in url:
	151	+ data_page += newsParser.newsSCMP.article(url)
	152	+ elif "telerama.fr" in url:
	153	+ data_page += newsParser.newsTelerama.article(url)
	154	+ elif "channelnewsasia.com" in url:
	155	+ data_page += newsParser.newsCNA.article(url)
	156	+ elif "vice.com" in url:
	157	+ data_page += newsParser.newsViceCom.article(url)
	158	+ elif "nytimes.com" in url:
	159	+ data_page += newsParser.newsNewYorkTimes.article(url)
	160	+ elif "mothership.sg" in url:
	161	+ data_page += newsParser.newsMothershipSG.article(url)
	162	+ elif "lemonde.fr" in url:
	163	+ data_page += newsParser.newsLeMonde.article(url)
	164	+ elif "lejdd.fr" in url:
	165	+ data_page += newsParser.newsJDD.article(url)
	166	+ elif "nouvelobs.com" in url:
	167	+ data_page += newsParser.newsNouvelObs.article(url)
	168	+ elif "huffingtonpost." in url:
	169	+ data_page += newsParser.newsHuffPost.article(url)
	170	+ elif "huffpost.com" in url:
	171	+ data_page += newsParser.newsHuffPost.article(url)
	172	+ elif "straitstimes.com" in url:
	173	+ data_page += newsParser.newsStraitsTimes.article(url)
	174	+ elif "newyorker.com" in url:
	175	+ data_page += newsParser.newsNewYorker.article(url)
	176	+ elif "lefigaro.fr" in url:
	177	+ data_page += newsParser.newsLeFigaro.article(url)
	178	+ elif "sudouest.fr" in url:
	179	+ data_page += newsParser.newsSudOuest.article(url)
	180	+ elif "bbc.com" in url:
	181	+ data_page += newsParser.newsBBC.article(url)
	182	+ elif "theatlantic.com" in url:
	183	+ data_page += newsParser.newsTheAtlantic.article(url)
	184	+ elif "thestar.com.my" in url:
	185	+ data_page += newsParser.newsTheStarMy.article(url)
	186	+ elif "challenges.fr" in url:
	187	+ data_page += newsParser.newsChallengesFr.article(url)
	188	+ elif "depeche.fr" in url:
	189	+ data_page += newsParser.newsLaDepeche.article(url)
	190	+ elif "guardian.com" in url or "guardian.co.uk" in url:
	191	+ data_page += newsParser.newsTheGuardian.article(url)
	192	+ elif "bloomberg.com" in url:
	193	+ data_page += newsParser.newsBloomberg.article(url)
	194	+ elif "francetvinfo.fr" in url:
	195	+ data_page += newsParser.newsFranceTVInfo.article(url)
	196	+ elif "theverge.com" in url:
	197	+ data_page += newsParser.newsTheVerge.article(url)
	198	+ elif "bondyblog.fr" in url:
	199	+ data_page += newsParser.newsBondyBlog.article(url)
	200	+ elif "frandroid.com" in url:
	201	+ data_page += newsParser.newsFrandroidCom.article(url)
	202	+ elif "buzzfeed.com" in url or "buzzfeednews.com" in url:
	203	+ data_page += newsParser.newsBuzzfeedCom.article(url)
	204	+ elif "news.yahoo.com" in url or "afp.com" in url:
	205	+ data_page += newsParser.newsYahooCom.article(url)
	206	+ elif "bfmtv.com" in url:
	207	+ data_page += newsParser.newsBFM.article(url)
	208	+ else:
	209	+ data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
	210	+ data_page += "<p>Supported News:"
	211	+ data_page += supportedList()
	212	+ data_page += "</p>\n"
	213	+ return data_page

+53

newsParser/newsParser/newsBBC.py

View

...	...	@@ -0,0 +1,53 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+
	5	+def article(url):
	6	+ say("Article: "+url)
	7	+ url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+
	23	+ articleCstBegin = "<article "
	24	+ articleCstEnd = "<div class=\"article-full__footer\">"
	25	+ articleCstEnd2 = "<section data-component=\"tag-list\""
	26	+ articleCstEnd3 = "</article>"
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ try:
	29	+ indexEnd = content.index(articleCstEnd)
	30	+ except:
	31	+ try:
	32	+ indexEnd = content.index(articleCstEnd2)
	33	+ except:
	34	+ indexEnd = content.index(articleCstEnd3)
	35	+ article_only = content[indexBegin:indexEnd]
	36	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	37	+ article_only = re.sub(r"</amp-img>", '', article_only)
	38	+ article_only = re.sub(r"<h2", '<h3', article_only)
	39	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	40	+ article_only = re.sub(r"<h1", '<h2', article_only)
	41	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	42	+ article_only = re.sub(r"<div id=\"share-tools-panel\" (.*?)>Share page</div>", '', article_only)
	43	+ article_only = re.sub(r"<a href=\"(.?)\" class=\"(.?)\">About sharing</a>", '', article_only)
	44	+ article_only = article_only.replace("><", ">\n<")
	45	+ article_only = re.sub(r"<span class=\"(.?)-VisuallyHidden (.?)\">image copyright</span>", '', article_only)
	46	+ article_only = re.sub(r"<span class=\"(.?)-VisuallyHidden (.?)\">image caption</span>", '', article_only)
	47	+ article_only = re.sub(r"<noscript>", '', article_only,re.MULTILINE)
	48	+ article_only = re.sub(r"</noscript>", '', article_only,re.MULTILINE)
	49	+ article_only = re.sub(r"<div class=\"(.?)-TagShareWrapper (.?)\">", '<div style="display: none;">', article_only,re.MULTILINE)
	50	+
	51	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.bbc.com/', article_only)
	52	+ pageContent += "<article>"+article_only+"</article>"
	53	+ return pageContent

+45

newsParser/newsParser/newsBFM.py

View

...	...	@@ -0,0 +1,45 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("dna.fr/","dna.fr/amp/")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+ pageContent = ""
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrImageUrl = newsParser.articleImage(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+ articleCstBegin = "<div class=\"content_body\">"
	16	+ articleCstEnd = "<div class=\"content_body\" id=\"content_body_bottom\">"
	17	+
	18	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	19	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	20	+ pageContent += "<em>"+articleStrDescription+"</em>\n"
	21	+
	22	+
	23	+ pageContent = ""
	24	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	25	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	26	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	27	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	28	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	29	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	30	+
	31	+ indexBegin = content.index(articleCstBegin)
	32	+ indexEnd = content.index(articleCstEnd)
	33	+ article_only = ""
	34	+ article_only = content[indexBegin:indexEnd]
	35	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	36	+ article_only = re.sub(r"</amp-img>", '', article_only)
	37	+ article_only = re.sub(r"<h2", '<h3', article_only)
	38	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	39	+ article_only = re.sub(r"<h1", '<h2', article_only)
	40	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	41	+
	42	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.bfmtv.com/', article_only)
	43	+ pageContent += "<article>"+article_only+"</article>"
	44	+ pageContent = pageContent.replace("><", ">\n<")
	45	+ return pageContent

+53

newsParser/newsParser/newsBloomberg.py

View

...	...	@@ -0,0 +1,53 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	9	+ content = r.text
	10	+ articleCstBegin = "<div class=\"article-content\">"
	11	+ articleCstBegin2 = "<time class=\"article-timestamp\""
	12	+ articleCstEnd = "<div class=\"bottom-left-rail-touts-spacer\">"
	13	+ try:
	14	+ indexBegin = content.index(articleCstBegin)
	15	+ except:
	16	+ try:
	17	+ indexBegin = content.index(articleCstBegin2)
	18	+ except:
	19	+ indexBegin = 0
	20	+ try:
	21	+ indexEnd = content.index(articleCstEnd)
	22	+ except:
	23	+ indexEnd = 0
	24	+ articleStrImageUrl = newsParser.articleImage(content)
	25	+ articleStrTitle = newsParser.articleTitle(content)
	26	+ articleStrDescription = newsParser.articleDescription(content)
	27	+
	28	+ pageContent = ""
	29	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	30	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	31	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	32	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	33	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	34	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	35	+
	36	+ article_only = ""
	37	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	38	+ article_only += "<em>"+articleStrDescription+"</em>\n"
	39	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	40	+ article_only += content[indexBegin:indexEnd]
	41	+ article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
	42	+ article_only = re.sub(r"<h2", '<h3', article_only)
	43	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	44	+ article_only = re.sub(r"<h1", '<h2', article_only)
	45	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	46	+ article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
	47	+ article_only = re.sub(r"<picture><source media=\"(.?)\" srcSet=\"(.?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
	48	+ article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
	49	+ article_only = article_only.replace("><", ">\n<")
	50	+
	51	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
	52	+ pageContent += "<article>"+article_only+"</article>"
	53	+ return pageContent

+45

newsParser/newsParser/newsBondyBlog.py

View

...	...	@@ -0,0 +1,45 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+
	5	+def article(url):
	6	+ say("Article: "+url)
	7	+ r = requests.get(url, allow_redirects=True)
	8	+ content = r.text
	9	+
	10	+ articleStrImageUrl = newsParser.articleImage(content)
	11	+ articleStrTitle = newsParser.articleTitle(content)
	12	+ articleStrDescription = newsParser.articleDescription(content)
	13	+
	14	+ pageContent = ""
	15	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	16	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	17	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	18	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	19	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	20	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	21	+
	22	+ articleCstBegin = "<section class=\"pageHeader\">"
	23	+ articleCstEnd = "<section class=\"pageComponents\">"
	24	+ articleCstEnd2 = "<section subscriptions-section=\"content-not-granted\">"
	25	+ articleCstEnd3 = "</article>"
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ try:
	28	+ indexEnd = content.index(articleCstEnd)
	29	+ except:
	30	+ try:
	31	+ indexEnd = content.index(articleCstEnd2)
	32	+ except:
	33	+ indexEnd = content.index(articleCstEnd3)
	34	+ article_only = content[indexBegin:indexEnd]
	35	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	36	+ article_only = re.sub(r"</amp-img>", '', article_only)
	37	+ article_only = re.sub(r"<h2", '<h3', article_only)
	38	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	39	+ article_only = re.sub(r"<h1", '<h2', article_only)
	40	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	41	+ article_only = article_only.replace("><", ">\n<")
	42	+
	43	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.bondyblog.fr/', article_only)
	44	+ pageContent += "<article>"+article_only+"</article>"
	45	+ return pageContent

+68

newsParser/newsParser/newsBuzzfeedCom.py

View

...	...	@@ -0,0 +1,68 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+
	7	+def article(url):
	8	+ say("Article: "+url)
	9	+ if not "/amphtml" in url:
	10	+ say("Trying AMP")
	11	+ url = url.replace("buzzfeednews.com/article","buzzfeednews.com/amphtml")
	12	+ url = url.replace("buzzfeed.com/","buzzfeed.com/amphtml/")
	13	+ url.replace("?origin=web-hf","")
	14	+
	15	+ r = requests.get(url, allow_redirects=True)
	16	+ content = r.text
	17	+ pageContent = ""
	18	+ articleCstBegin = "<article "
	19	+ articleCstEnd = "<div class=\"subbuzz subbuzz-bfp\">"
	20	+ articleCstEnd2 = "</article>"
	21	+ articleCstEnd3 = "<div class=\"shares shares--inline"
	22	+ articleStrTitle = newsParser.articleTitle(content)
	23	+ articleStrDescription = newsParser.articleDescription(content)
	24	+ articleStrImageUrl = newsParser.articleImage(content)
	25	+
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ try:
	28	+ indexEnd = content.index(articleCstEnd)
	29	+ except:
	30	+ try:
	31	+ indexEnd = content.index(articleCstEnd2)
	32	+ except:
	33	+ indexEnd = content.index(articleCstEnd3)
	34	+
	35	+ pageContent = ""
	36	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	37	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	38	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	39	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	40	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	41	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	42	+ article_only = ""
	43	+ article_only += "<h1>"+articleStrTitle+"</h1>\n"
	44	+ article_only += "<em>"+articleStrDescription+"</em>\n"
	45	+ article_only += content[indexBegin:indexEnd]
	46	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	47	+ article_only = re.sub(r"</amp-img>", '', article_only)
	48	+ article_only = re.sub(r"<h2", '<h3', article_only)
	49	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	50	+ article_only = re.sub(r"<h1", '<h2', article_only)
	51	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	52	+
	53	+ # ~ article_only = re.sub(r"<picture><source media=\"(.?)\" srcSet=\"(.?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
	54	+ article_only = re.sub(r"<amp-social-share (.*?)>", "<amp-social-share>", article_only)
	55	+ article_only = re.sub(r"<span class=\"icon icon--primary flex\">", "<span>", article_only)
	56	+ article_only = re.sub(r"<title>(.*?)</title>", "", article_only)
	57	+ article_only = re.sub(r"<use xlink:href=\"(.*?)\">", "<use>", article_only)
	58	+ article_only = re.sub(r"<svg class=\"svg-(.*?)\">", "<svg height=\"1px\">", article_only)
	59	+ article_only = re.sub(r"Share on Facebook", "", article_only)
	60	+ article_only = re.sub(r"Share on Pinterest", "", article_only)
	61	+ article_only = article_only.replace("><", ">\n<")
	62	+
	63	+ if "buzzfeed.com" in url:
	64	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.buzzfeed.com/', article_only)
	65	+ elif "buzzfeednews.com" in url:
	66	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.buzzfeednews.com/', article_only)
	67	+ pageContent += "<article>"+article_only+"</article>"
	68	+ return pageContent

+59

newsParser/newsParser/newsCNA.py

View

...	...	@@ -0,0 +1,59 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\">"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def article(url):
	15	+ say("Article: "+url)
	16	+ r = requests.get(url, allow_redirects=True)
	17	+ content = r.text
	18	+ pageContent = ""
	19	+ articleCstBegin = "<article"
	20	+ articleCstEnd = "<footer class=\"article__footer\">"
	21	+ indexBegin = content.index(articleCstBegin)
	22	+ indexEnd = content.index(articleCstEnd)
	23	+ articleStrImageUrl = newsParser.articleImage(content)
	24	+
	25	+ articleStrTitle = newsParser.articleTitle(content)
	26	+ articleStrDescription = newsParser.articleDescription(content)
	27	+
	28	+ pageContent = ""
	29	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	30	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	31	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	32	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	33	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	34	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	35	+
	36	+ article_only = content[indexBegin:indexEnd]
	37	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	38	+ article_only = re.sub(r"</amp-img>", '', article_only)
	39	+ article_only = re.sub(r"<h2", '<h3', article_only)
	40	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	41	+ article_only = re.sub(r"<h1", '<h2', article_only)
	42	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	43	+ newImage="<img src=\""+articleStrImageUrl+"\"><div class=\"article__author-link\">"
	44	+ article_only = re.sub(r"<div class=\"article__author-link\">", newImage, article_only)
	45	+ article_only = re.sub(r"<span class=\"advertisement__title\">Advertisement</span>", '', article_only)
	46	+
	47	+ article_only = re.sub(r"class=\"picture__image lazyload\"", '', article_only)
	48	+ article_only = re.sub(r"<a class=\"addthis_button(.*)</a>", '', article_only)
	49	+ article_only = re.sub(r"<div class=\"c-sharing--default is-article-top-position\"", '<div class="c-sharing--default is-article-top-position" style="display:none"', article_only)
	50	+ article_only = re.sub(r"<h3 class=\"save-for-later__title\">Bookmark</h3>", '', article_only)
	51	+ article_only = re.sub(r"<h2", '<h3', article_only)
	52	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	53	+ article_only = re.sub(r"<h1", '<h2', article_only)
	54	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	55	+ article_only = article_only.replace("><", ">\n<")
	56	+
	57	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.channelnewsasia.com/', article_only)
	58	+ pageContent += "<article>"+article_only+"</article>"
	59	+ return pageContent

+55

newsParser/newsParser/newsChallengesFr.py

View

...	...	@@ -0,0 +1,55 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleAbonnes(content):
	7	+ articleAbonnes = "réservé aux abonnés"
	8	+ articleType = ""
	9	+ try:
	10	+ indexAbonnes = content.index(articleAbonnes)
	11	+ articleType = "Abonnés"
	12	+ except:
	13	+ articleType = ""
	14	+ return articleType
	15	+
	16	+def article(url):
	17	+ say("Article: "+url)
	18	+ r = requests.get(url, allow_redirects=True)
	19	+ content = r.text
	20	+ articleStrImageUrl = articleImage(content)
	21	+ articleStrTitle = articleTitle(content)
	22	+ articleStrImageUrl = newsParser.articleImage(content)
	23	+ articleStrTitle = newsParser.articleTitle(content)
	24	+ articleStrType = articleAbonnes(content)
	25	+
	26	+ pageContent = ""
	27	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	28	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	29	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	30	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	31	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	32	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	33	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	34	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	35	+
	36	+ articleCstBegin = "<article "
	37	+ articleCstEnd = "<div id=\"poool-widget\">"
	38	+ articleCstEnd2 = "</article>"
	39	+ indexBegin = content.index(articleCstBegin)
	40	+ try:
	41	+ indexEnd = content.index(articleCstEnd)
	42	+ except:
	43	+ indexEnd = content.index(articleCstEnd2)
	44	+ article_only = content[indexBegin:indexEnd]
	45	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	46	+ article_only = re.sub(r"</amp-img>", '', article_only)
	47	+ article_only = re.sub(r"<h2", '<h3', article_only)
	48	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	49	+ article_only = re.sub(r"<h1", '<h2', article_only)
	50	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	51	+
	52	+ article_only = re.sub(r"href=\"\/", 'href=\"//wwww.liberation.fr/', article_only)
	53	+ pageContent += article_only
	54	+ pageContent += "<p>"+articleStrType+"</p>"
	55	+ return pageContent

+38

newsParser/newsParser/newsDNA.py

View

...	...	@@ -0,0 +1,38 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("dna.fr/","dna.fr/amp/")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<section poool-access-content amp-access=\"access\" amp-access-hide>"
	25	+ articleCstEnd = "<section amp-access=\"NOT error AND NOT access\" id=\"poool\">"
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ indexEnd = content.index(articleCstEnd)
	28	+ article_only = content[indexBegin:indexEnd]
	29	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	30	+ article_only = re.sub(r"</amp-img>", '', article_only)
	31	+ article_only = re.sub(r"<h2", '<h3', article_only)
	32	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	33	+ article_only = re.sub(r"<h1", '<h2', article_only)
	34	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	35	+
	36	+ article_only = re.sub(r"href=\"\/", 'href=\"//dna.fr/', article_only)
	37	+ pageContent += "<article>"+article_only+"</article>"
	38	+ return pageContent

+43

newsParser/newsParser/newsFranceTVInfo.py

View

...	...	@@ -0,0 +1,43 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace(".html",".amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article "
	25	+ articleCstEnd = "<section class=\"social-zone\">"
	26	+ articleCstEnd2 = "</article"
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ try:
	29	+ indexEnd = content.index(articleCstEnd)
	30	+ except:
	31	+ indexEnd = content.index(articleCstEnd2)
	32	+ article_only = content[indexBegin:indexEnd]
	33	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	34	+ article_only = re.sub(r"</amp-img>", '', article_only)
	35	+ article_only = re.sub(r"<h2", '<h3', article_only)
	36	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	37	+ article_only = re.sub(r"<h1", '<h2', article_only)
	38	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	39	+
	40	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.francetvinfo.fr/', article_only)
	41	+ article_only = re.sub(r"src=\"\/", 'src=\"//www.francetvinfo.fr/', article_only)
	42	+ pageContent += "<article>"+article_only+"</article>"
	43	+ return pageContent

+48

newsParser/newsParser/newsFrandroidCom.py

View

...	...	@@ -0,0 +1,48 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+ articleCstBegin = "<div class=\"article-content"
	11	+ articleCstEnd = " <p class=\"title\">"
	12	+ articleCstEnd2 = "<div class=\"article-footer"
	13	+ articleCstEnd3 = "</article>"
	14	+ articleStrTitle = newsParser.articleTitle(content)
	15	+ articleStrDescription = newsParser.articleDescription(content)
	16	+ articleStrImageUrl = newsParser.articleImage(content)
	17	+
	18	+
	19	+ pageContent = ""
	20	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	21	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	22	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	23	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	24	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	25	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	26	+
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ try:
	29	+ indexEnd = content.index(articleCstEnd)
	30	+ except:
	31	+ try:
	32	+ indexEnd = content.index(articleCstEnd2)
	33	+ except:
	34	+ indexEnd = content.index(articleCstEnd3)
	35	+ article_only = ""
	36	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	37	+ article_only += content[indexBegin:indexEnd]
	38	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	39	+ article_only = re.sub(r"</amp-img>", '', article_only)
	40	+ article_only = re.sub(r"<h2", '<h3', article_only)
	41	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	42	+ article_only = re.sub(r"<h1", '<h2', article_only)
	43	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	44	+ article_only = article_only.replace("><", ">\n<")
	45	+
	46	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.frandroid.com/', article_only)
	47	+ pageContent += article_only
	48	+ return pageContent

+36

newsParser/newsParser/newsFt.py

View

...	...	@@ -0,0 +1,36 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("www","amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article"
	25	+ articleCstEnd = "</article>"
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ indexEnd = content.index(articleCstEnd)
	28	+ article_only = content[indexBegin:indexEnd]
	29	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	30	+ article_only = re.sub(r"</amp-img>", '', article_only)
	31	+ article_only = re.sub(r"<h2", '<h3', article_only)
	32	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	33	+ article_only = re.sub(r"<h1", '<h2', article_only)
	34	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	35	+ pageContent += "<article>"+article_only+"</article>"
	36	+ return pageContent

+46

newsParser/newsParser/newsHuffPost.py

View

...	...	@@ -0,0 +1,46 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	9	+ content = r.text
	10	+
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+
	23	+ articleCstBegin = "<article"
	24	+ articleCstEnd = "<div class=\"related-entries"
	25	+ articleCstEnd2 = "</article>"
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ try:
	28	+ indexEnd = content.index(articleCstEnd)
	29	+ except:
	30	+ indexEnd = content.index(articleCstEnd2)
	31	+ article_only = content[indexBegin:indexEnd]
	32	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	33	+ article_only = re.sub(r"</amp-img>", '', article_only)
	34	+ article_only = re.sub(r"<h2", '<h3', article_only)
	35	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	36	+ article_only = re.sub(r"<h1", '<h2', article_only)
	37	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	38	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	39	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	40	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	41	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	42	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	43	+ article_only = article_only.replace("><", ">\n<")
	44	+
	45	+ pageContent += "<article>"+article_only+"</article>"
	46	+ return pageContent

+56

newsParser/newsParser/newsJDD.py

View

...	...	@@ -0,0 +1,56 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+
	7	+def articleAbonnes(content):
	8	+ articleAbonnes = "ABONNÉS"
	9	+ articleType = ""
	10	+ try:
	11	+ indexAbonnes = content.index(articleAbonnes)
	12	+ articleType = "Abonnés"
	13	+ except:
	14	+ articleType = ""
	15	+ return articleType
	16	+
	17	+def article(url):
	18	+ say("Article: "+url)
	19	+ r = requests.get(url, allow_redirects=True)
	20	+ content = r.text
	21	+ articleStrType = articleAbonnes(content)
	22	+
	23	+ articleStrImageUrl = newsParser.articleImage(content)
	24	+ articleStrTitle = newsParser.articleTitle(content)
	25	+ articleStrDescription = newsParser.articleDescription(content)
	26	+
	27	+ pageContent = ""
	28	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	29	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	30	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	31	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	32	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	33	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	34	+
	35	+ articleCstBegin = "<article "
	36	+ articleCstEnd = "</article>"
	37	+ indexBegin = content.index(articleCstBegin)
	38	+ indexEnd = content.index(articleCstEnd)
	39	+ article_only = content[indexBegin:indexEnd]
	40	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	41	+ article_only = re.sub(r"</amp-img>", '', article_only)
	42	+ article_only = re.sub(r"<h2", '<h3', article_only)
	43	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	44	+ article_only = re.sub(r"<h1", '<h2', article_only)
	45	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	46	+ article_only = re.sub(r"<a href=(.*?) id=\"fb_socialPageLink\" class=\"icon-Facebook\">", '<a href="">', article_only)
	47	+ article_only = re.sub(r"<a href=(.*?) id=\"tw_socialPageLink\" class=\"icon-Twitter\">", '<a href="">', article_only)
	48	+ article_only = re.sub(r"target=\"_self\"", 'target="new"', article_only)
	49	+ article_only = re.sub(r"<div class=\"nota col-md-4\">Partager sur :</div>", '', article_only)
	50	+ article_only = re.sub(r"<span class=\"hide\">\"</span>", '', article_only)
	51	+ article_only = article_only.replace("><", ">\n<")
	52	+
	53	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.lejdd.fr/', article_only)
	54	+ pageContent += "<article>"+article_only+"</article>"
	55	+ pageContent += "<p>"+articleStrType+"</p>"
	56	+ return pageContent

+49

newsParser/newsParser/newsLaDepeche.py

View

...	...	@@ -0,0 +1,49 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ if not "/amp" in url:
	9	+ say("Trying AMP")
	10	+ url = url.replace("www.ladepeche.fr","www.ladepeche.fr/amp")
	11	+ r = requests.get(url, allow_redirects=True)
	12	+ content = r.text
	13	+
	14	+ articleStrImageUrl = newsParser.articleImage(content)
	15	+ articleStrTitle = newsParser.articleTitle(content)
	16	+ articleStrDescription = newsParser.articleDescription(content)
	17	+
	18	+ pageContent = ""
	19	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	20	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	21	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	22	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	23	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	24	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	25	+
	26	+ articleCstBegin = "<article "
	27	+ articleCstEnd = "<div class=\"article-full__footer\">"
	28	+ articleCstEnd2 = "<section subscriptions-section=\"content-not-granted\">"
	29	+ articleCstEnd3 = "</article>"
	30	+ indexBegin = content.index(articleCstBegin)
	31	+ try:
	32	+ indexEnd = content.index(articleCstEnd)
	33	+ except:
	34	+ try:
	35	+ indexEnd = content.index(articleCstEnd2)
	36	+ except:
	37	+ indexEnd = content.index(articleCstEnd3)
	38	+ article_only = content[indexBegin:indexEnd]
	39	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	40	+ article_only = re.sub(r"</amp-img>", '', article_only)
	41	+ article_only = re.sub(r"<h2", '<h3', article_only)
	42	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	43	+ article_only = re.sub(r"<h1", '<h2', article_only)
	44	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	45	+ article_only = article_only.replace("><", ">\n<")
	46	+
	47	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.ladepeche.fr/', article_only)
	48	+ pageContent += "<article>"+article_only+"</article>"
	49	+ return pageContent

+47

newsParser/newsParser/newsLeFigaro.py

View

...	...	@@ -0,0 +1,47 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ # ~ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article"
	25	+ articleCstEnd = "<div class=\"related-entries"
	26	+ articleCstEnd2 = "</article>"
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ try:
	29	+ indexEnd = content.index(articleCstEnd)
	30	+ except:
	31	+ indexEnd = content.index(articleCstEnd2)
	32	+ article_only = content[indexBegin:indexEnd]
	33	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	34	+ article_only = re.sub(r"</amp-img>", '', article_only)
	35	+ article_only = re.sub(r"<h2", '<h3', article_only)
	36	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	37	+ article_only = re.sub(r"<h1", '<h2', article_only)
	38	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	39	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	40	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	41	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	42	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	43	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	44	+ article_only = article_only.replace("><", ">\n<")
	45	+
	46	+ pageContent += "<article>"+article_only+"</article>"
	47	+ return pageContent

+81

newsParser/newsParser/newsLeMonde.py

View

...	...	@@ -0,0 +1,81 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\"http"
	8	+ articleImgEnd ="\">"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = "http"+content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\">"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+def articleAbonnes(content):
	23	+ articleAbonnes = "article__content--restricted"
	24	+ articleType = ""
	25	+ indexAbonnes = -1
	26	+ try:
	27	+ indexAbonnes = content.index(articleAbonnes)
	28	+ articleType = "Abonnés"
	29	+ except:
	30	+ articleType = ""
	31	+ return articleType
	32	+
	33	+def article(url):
	34	+ say("Article: "+url)
	35	+ # ~ url = url.replace("www.lemonde","abonnes.lemonde")
	36	+ r = requests.get(url, allow_redirects=True)
	37	+ content = r.text
	38	+ # ~ print(content)
	39	+ pageContent = ""
	40	+ articleCstBegin = "<section class=\"article__content"
	41	+ articleCstBegin2 = "<article "
	42	+ articleCstBegin2 = "<article "
	43	+ articleCstEnd = "</article>"
	44	+ try:
	45	+ indexBegin = content.index(articleCstBegin)
	46	+ except:
	47	+ indexBegin = content.index(articleCstBegin2)
	48	+ indexEnd = content.index(articleCstEnd)
	49	+ articleStrImageUrl = newsParser.articleImage(content)
	50	+ articleStrTitle = newsParser.articleTitle(content)
	51	+ articleStrDescription = newsParser.articleDescription(content)
	52	+ articleStrType = articleAbonnes(content)
	53	+
	54	+ pageContent = ""
	55	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	56	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	57	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	58	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	59	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	60	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	61	+
	62	+ article_only = "<h2>"+articleStrTitle+"</h2>\n"
	63	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	64	+ article_only += content[indexBegin:indexEnd]
	65	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	66	+ article_only = re.sub(r"</amp-img>", '', article_only)
	67	+ # ~ newImage="<img src=\""+articleStrImageUrl+"\"><div class=\"article__author-link\">"
	68	+ article_only = re.sub(r"<figure class=\"article__media\">(.*?)</figure>",'', article_only)
	69	+
	70	+ article_only = re.sub(r"<img src=\"data(.?)\" data-srcset=\" (.?) 1x,(.?)\"(.?)>","<img src=\"\g<2>\">", article_only)
	71	+ article_only = re.sub(r"</p>", "</p>\n", article_only)
	72	+ article_only = re.sub(r"<h2", '<h3', article_only)
	73	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	74	+ article_only = re.sub(r"<h1", '<h2', article_only)
	75	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	76	+ article_only = article_only.replace("><", ">\n<")
	77	+
	78	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.lemonde.fr/', article_only)
	79	+ pageContent += "<article>"+article_only+"</article>"
	80	+ pageContent += "<p>"+articleStrType+"</p>"
	81	+ return pageContent

+41

newsParser/newsParser/newsLeParisien.py

View

...	...	@@ -0,0 +1,41 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ # ~ url = url.replace("dna.fr/","dna.fr/amp/")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article "
	25	+ # ~ articleCstEnd = "</article>"
	26	+ articleCstEnd = "<div class=\"article-spacing\">"
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ indexEnd = content.index(articleCstEnd)
	29	+ article_only = content[indexBegin:indexEnd]
	30	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	31	+ article_only = re.sub(r"</amp-img>", '', article_only)
	32	+ article_only = re.sub(r"<h2", '<h3', article_only)
	33	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	34	+ article_only = re.sub(r"<h1", '<h2', article_only)
	35	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	36	+
	37	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.leparisien.fr/', article_only)
	38	+ article_only = re.sub(r"src=\"\/", 'src=\"//www.leparisien.fr/', article_only)
	39	+ article_only = article_only.replace("><", ">\n<")
	40	+ pageContent += "<article>"+article_only+"</article>"
	41	+ return pageContent

+66

newsParser/newsParser/newsLiberation.py

View

...	...	@@ -0,0 +1,66 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\"/>"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\"/>"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+def articleAbonnes(content):
	23	+ articleAbonnes = "réservé aux abonnés"
	24	+ articleType = ""
	25	+ try:
	26	+ indexAbonnes = content.index(articleAbonnes)
	27	+ articleType = "Abonnés"
	28	+ except:
	29	+ articleType = ""
	30	+ return articleType
	31	+
	32	+def article(url):
	33	+ say("Article: "+url)
	34	+ r = requests.get(url, allow_redirects=True)
	35	+ content = r.text
	36	+ articleStrImageUrl = newsParser.articleImage(content)
	37	+ articleStrTitle = newsParser.articleTitle(content)
	38	+ articleStrDescription = newsParser.articleDescription(content)
	39	+ articleStrType = articleAbonnes(content)
	40	+
	41	+ pageContent = ""
	42	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	43	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	44	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	45	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	46	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	47	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	48	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	49	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	50	+
	51	+ articleCstBegin = "<article "
	52	+ articleCstEnd = "</article>"
	53	+ indexBegin = content.index(articleCstBegin)
	54	+ indexEnd = content.index(articleCstEnd)
	55	+ article_only = content[indexBegin:indexEnd]
	56	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	57	+ article_only = re.sub(r"</amp-img>", '', article_only)
	58	+ article_only = re.sub(r"<h2", '<h3', article_only)
	59	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	60	+ article_only = re.sub(r"<h1", '<h2', article_only)
	61	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	62	+
	63	+ article_only = re.sub(r"href=\"\/", 'href=\"//wwww.liberation.fr/', article_only)
	64	+ pageContent += article_only
	65	+ pageContent += "<p>"+articleStrType+"</p>"
	66	+ return pageContent

+48

newsParser/newsParser/newsMidiLibre.py

View

...	...	@@ -0,0 +1,48 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article "
	25	+ # ~ articleCstEnd = "</article>"
	26	+ articleCstEnd = "<div class=\"article-full__footer\">"
	27	+ articleCstEnd2 = "<section subscriptions-section=\"content-not-granted\">"
	28	+ articleCstEnd3 = "</article>"
	29	+ indexBegin = content.index(articleCstBegin)
	30	+ try:
	31	+ indexEnd = content.index(articleCstEnd)
	32	+ except:
	33	+ try:
	34	+ indexEnd = content.index(articleCstEnd2)
	35	+ except:
	36	+ indexEnd = content.index(articleCstEnd3)
	37	+ article_only = content[indexBegin:indexEnd]
	38	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	39	+ article_only = re.sub(r"</amp-img>", '', article_only)
	40	+ article_only = re.sub(r"<h2", '<h3', article_only)
	41	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	42	+ article_only = re.sub(r"<h1", '<h2', article_only)
	43	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	44	+ article_only = article_only.replace("><", ">\n<")
	45	+
	46	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.midilibre.fr/', article_only)
	47	+ pageContent += "<article>"+article_only+"</article>"
	48	+ return pageContent

+57

newsParser/newsParser/newsMothershipSG.py

View

...	...	@@ -0,0 +1,57 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+
	23	+ articleCstBegin = "<div class=\"main-item\" "
	24	+ articleCstEnd2 = "<div class=\"social-share bottom\">"
	25	+ indexBegin = content.index(articleCstBegin)
	26	+ indexEnd = content.index(articleCstEnd2,indexBegin)
	27	+ articleStrImageUrl = newsParser.articleImage(content)
	28	+ articleStrTitle = newsParser.articleTitle(content)
	29	+
	30	+ article_only = "<h2>"+articleStrTitle+"</h2>\n"
	31	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	32	+ article_only += content[indexBegin:indexEnd]
	33	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	34	+ article_only = re.sub(r"</amp-img>", '', article_only)
	35	+ article_only = re.sub(r"<h2", '<h3', article_only)
	36	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	37	+ article_only = re.sub(r"<h1", '<h2', article_only)
	38	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	39	+ newImage="<img src=\""+articleStrImageUrl+"\"><div class=\"article__author-link\">"
	40	+ article_only = re.sub(r"<div class=\"article__author-link\">", newImage, article_only)
	41	+ article_only = re.sub(r"<span class=\"advertisement__title\">Advertisement</span>", '', article_only)
	42	+
	43	+ article_only = re.sub(r"class=\"picture__image lazyload\"", '', article_only)
	44	+ article_only = re.sub(r"<a class=\"addthis_button(.*)</a>", '', article_only)
	45	+ article_only = re.sub(r"<div class=\"c-sharing--default is-article-top-position\"", '<div class="c-sharing--default is-article-top-position" style="display:none"', article_only)
	46	+ article_only = re.sub(r"<h3 class=\"save-for-later__title\">Bookmark</h3>", '', article_only)
	47	+ article_only = re.sub(r"<h2", '<h3', article_only)
	48	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	49	+ article_only = re.sub(r"<h1", '<h2', article_only)
	50	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	51	+ article_only = article_only.replace("><", ">\n<")
	52	+
	53	+ article_only = re.sub(r"href=\"\/", 'href=\"///mothership.sg/', article_only)
	54	+ article_only = re.sub(r"src=\"\/", 'src=\"///mothership.sg/', article_only)
	55	+ article_only = re.sub(r"src='\/", "src='//mothership.sg/", article_only)
	56	+ pageContent += "<article>"+article_only+"</article>"
	57	+ return pageContent

+58

newsParser/newsParser/newsNSTMy.py

View

...	...	@@ -0,0 +1,58 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import json
	5	+import newsParser
	6	+
	7	+def article(url):
	8	+ say("Article: "+url)
	9	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	10	+ content = r.text
	11	+
	12	+ articleCstBegin = "<article-component :article=\""
	13	+ articleCstEnd = "\" :nid="
	14	+ indexBegin = content.index(articleCstBegin)
	15	+ indexEnd = content.index(articleCstEnd)
	16	+ article_json = content[indexBegin+len(articleCstBegin):indexEnd]
	17	+ article_json = article_json.replace(""","\"")
	18	+ article_json = article_json.replace("\/","/")
	19	+ article_json = article_json.replace("<","<")
	20	+ article_json = article_json.replace(">",">")
	21	+ jsonArticle = json.loads(article_json)
	22	+
	23	+ article_only = ""
	24	+ articleStrImageUrl = jsonArticle['field_article_images'][0]['url']
	25	+ articleStrImageCaption = jsonArticle['field_article_images'][0]['caption']
	26	+ articleStrTitle = jsonArticle['title']
	27	+
	28	+ articleStrDescription = newsParser.articleDescription(content)
	29	+
	30	+ pageContent = ""
	31	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	32	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	33	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	34	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	35	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	36	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	37	+
	38	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	39	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	40	+ if None is not articleStrImageCaption:
	41	+ article_only += "<em>"+articleStrImageCaption+"</em>\n"
	42	+ article_only += jsonArticle['body']
	43	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	44	+ article_only = re.sub(r"</amp-img>", '', article_only)
	45	+ article_only = re.sub(r"<h2", '<h3', article_only)
	46	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	47	+ article_only = re.sub(r"<h1", '<h2', article_only)
	48	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	49	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	50	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	51	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	52	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	53	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	54	+ article_only = article_only.replace("><", ">\n<")
	55	+
	56	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.straitstimes.com/', article_only)
	57	+ pageContent += "<article>"+article_only+"</article>"
	58	+ return pageContent

+48

newsParser/newsParser/newsNewYorkTimes.py

View

...	...	@@ -0,0 +1,48 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+
	23	+ articleCstBegin = "<section name=\"articleBody\""
	24	+ articleCstEnd = "</article>"
	25	+ indexBegin = content.index(articleCstBegin)
	26	+ indexEnd = content.index(articleCstEnd)
	27	+
	28	+ article_only = ""
	29	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	30	+ article_only += "<em>"+articleStrDescription+"</em>\n"
	31	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	32	+ article_only += content[indexBegin:indexEnd]
	33	+ article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
	34	+ article_only = re.sub(r"<h2", '<h3', article_only)
	35	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	36	+ article_only = re.sub(r"<h1", '<h2', article_only)
	37	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	38	+ article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
	39	+ # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
	40	+ # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
	41	+ article_only = re.sub(r"<picture><source media=\"(.?)\" srcSet=\"(.?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
	42	+ article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
	43	+ # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
	44	+ article_only = article_only.replace("><", ">\n<")
	45	+
	46	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
	47	+ pageContent += "<article>"+article_only+"</article>"
	48	+ return pageContent

+56

newsParser/newsParser/newsNewYorker.py

View

...	...	@@ -0,0 +1,56 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ # ~ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+ # ~ print(content)
	12	+
	13	+ articleStrImageUrl = newsParser.articleImage(content)
	14	+ articleStrTitle = newsParser.articleTitle(content)
	15	+ articleStrDescription = newsParser.articleDescription(content)
	16	+
	17	+ pageContent = ""
	18	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	19	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	20	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	21	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	22	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	23	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	24	+
	25	+ articleCstBegin = "<article"
	26	+ articleCstEnd = "<div class=\"related-entries"
	27	+ articleCstEnd2 = "</article>"
	28	+ indexBegin = content.index(articleCstBegin)
	29	+ try:
	30	+ indexEnd = content.index(articleCstEnd)
	31	+ except:
	32	+ indexEnd = content.index(articleCstEnd2)
	33	+ article_only = content[indexBegin:indexEnd]
	34	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	35	+ article_only = re.sub(r"</amp-img>", '', article_only)
	36	+ article_only = re.sub(r"<h2", '<h3', article_only)
	37	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	38	+ article_only = re.sub(r"<h1", '<h2', article_only)
	39	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	40	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	41	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	42	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	43	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	44	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	45	+ article_only = re.sub(r"<div>Save this story for later.</div>",'', article_only)
	46	+ article_only = re.sub(r"<a class=\"sc-(.*?)byline__name-link button",'<a ', article_only)
	47	+ article_only = re.sub(r"<li class=\"social-icons__list-item social-icons__list-item--print social-icons__list-item--standard thinner\">(.*?)</li>",'', article_only,re.MULTILINE)
	48	+ article_only = re.sub(r"<li class=\"social-icons__list-item social-icons__list-item--bookmark social-icons__list-item--standard thinner bookmark-disabled\">(.*?)</li>",'', article_only,re.MULTILINE)
	49	+ article_only = re.sub(r"<ul class=\"social-icons__list\">(.*?)</ul>",'', article_only,re.MULTILINE)
	50	+ article_only = re.sub(r"<aside class=\"sc(.*?)</aside>",'', article_only,re.MULTILINE)
	51	+ article_only = re.sub(r"<noscript>(.*?)</noscript>",'', article_only,re.MULTILINE)
	52	+ article_only = re.sub(r"<svg class=\"icon icon-print\" width=\"17\" height=\"16\" viewBox=\"0 0 17 16\" fill=\"none\" xmlns=\"http://www.w3.org/2000/svg\">",'', article_only)
	53	+ article_only = article_only.replace("><", ">\n<")
	54	+
	55	+ pageContent += "<article>"+article_only+"</article>"
	56	+ return pageContent

+44

newsParser/newsParser/newsNouvelObs.py

View

...	...	@@ -0,0 +1,44 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace(".html",".amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<header class=\"article__header\">"
	25	+ articleCstEnd = "<span class=\"article-comments__headline-title\">"
	26	+ articleCstEnd2 = "<div class=\"article-comments__comment-react\">"
	27	+ indexBegin = content.index(articleCstBegin)
	28	+ try:
	29	+ indexEnd = content.index(articleCstEnd)
	30	+ except:
	31	+ indexEnd = content.index(articleCstEnd2)
	32	+
	33	+ article_only = content[indexBegin:indexEnd]
	34	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	35	+ article_only = re.sub(r"</amp-img>", '', article_only)
	36	+ article_only = re.sub(r"<h2", '<h3', article_only)
	37	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	38	+ article_only = re.sub(r"<h1", '<h2', article_only)
	39	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	40	+ article_only = article_only.replace("><", ">\n<")
	41	+
	42	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.nouvelobs.com/', article_only)
	43	+ pageContent += "<article>"+article_only+"</article>"
	44	+ return pageContent

+139

newsParser/newsParser/newsSCMP.py

View

...	...	@@ -0,0 +1,139 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import json
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+
	11	+ #uuid extraction
	12	+ articleElementBegin ="name=\"cse_uuid\" content=\""
	13	+ articleElementEnd ="\"/>"
	14	+ indexElementBegin = content.index(articleElementBegin)
	15	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	16	+ entityUUID = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	17	+
	18	+ cstJsonBegin = "window.__APOLLO_STATE__="
	19	+ cstJsonEnd = "</script><script>"
	20	+ indexBegin = content.index(cstJsonBegin)
	21	+ indexBegin += len(cstJsonBegin)
	22	+ indexEnd = content.index(cstJsonEnd)
	23	+ raw_only = content[indexBegin:indexEnd]
	24	+ json_only = json.loads(raw_only)
	25	+
	26	+ with open('data.json', 'w') as f:
	27	+ json.dump(json_only, f)
	28	+
	29	+ applicationId = None
	30	+ json_article = None
	31	+ keyArticle = None
	32	+ for key in json_only["contentService"]["ROOT_QUERY"]:
	33	+ if "\"applicationId\":" in key:
	34	+ keySplit=key.split("\"")
	35	+ applicationId = keySplit[len(keySplit) - 2]
	36	+ keyArticle=json_only["contentService"]["ROOT_QUERY"][key]["id"]
	37	+
	38	+
	39	+
	40	+ json_article=json_only["contentService"][keyArticle]
	41	+ articleStrTitle = json_article["socialHeadline"]
	42	+ articleStrDescription = ""
	43	+
	44	+ for key in json_article["summary"]["json"]:
	45	+ htmlType=key["type"]
	46	+ htmlContent=key["children"][0]["data"]
	47	+ articleStrDescription+="<"+htmlType+">"+htmlContent+"</"+htmlType+">"
	48	+
	49	+ pageContent = ""
	50	+ pageContent += "<meta property=\"og:type\" content=\"article\">"
	51	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">"
	52	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">"
	53	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">"
	54	+ #pageContent += "<meta property=\"og:image\" content=\""+articleStrImage+"\">"
	55	+ pageContent += "<article>"
	56	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	57	+ pageContent += "<em>"+articleStrDescription+"</em>\n"
	58	+
	59	+ # Article Extraction attempt
	60	+ keyArticle2=""
	61	+ for key in json_article:
	62	+ if "body({\"customContents\"" in key:
	63	+ keyArticle2=key
	64	+
	65	+ #say("UUID :"+entityUUID)
	66	+ #say("AppID:"+applicationId)
	67	+ #say("Key :"+keyArticle)
	68	+ #say("Title:"+articleStrTitle)
	69	+ #say("Desc :"+articleStrDescription)
	70	+ json_article2 = json_article[keyArticle2]
	71	+ # ~ with open('data3.json', 'w') as f2:
	72	+ # ~ json.dump(json_article2, f2)
	73	+
	74	+ cpt=0
	75	+ for element in json_article2["json"]:
	76	+ htmlType=element["type"]
	77	+ # ~ print("Bef Element: "+htmlType)
	78	+ if "ad1" in htmlType:
	79	+ continue
	80	+ elif "ad2" in htmlType:
	81	+ continue
	82	+ elif "ad3" in htmlType:
	83	+ continue
	84	+ elif "ad4" in htmlType:
	85	+ continue
	86	+ elif "ad5" in htmlType:
	87	+ continue
	88	+ elif "native-ads" in htmlType:
	89	+ continue
	90	+ elif "more-on-this" in htmlType:
	91	+ continue
	92	+ # ~ print("Aft Element: "+htmlType)
	93	+ try:
	94	+ htmlContent = element["children"]
	95	+ except:
	96	+ continue
	97	+ pageContent += "<"+htmlType+">"
	98	+
	99	+ for elementChildren in htmlContent:
	100	+ htmlTypeChildren=elementChildren["type"]
	101	+ if "text" == htmlTypeChildren:
	102	+ pageContent += elementChildren["data"]
	103	+ elif "a" == htmlTypeChildren:
	104	+ href=elementChildren["attribs"]["href"]
	105	+ pageContent += "<a href=\""+href+"\" target=\"new-"+str(cpt)+"\">"
	106	+ pageContent += elementChildren["children"][0]["data"]
	107	+ pageContent += "</"+htmlTypeChildren+">"
	108	+ elif "img" == htmlTypeChildren:
	109	+ src=elementChildren["attribs"]["src"]
	110	+ caption=elementChildren["attribs"]["title"]
	111	+ pageContent += "<img src=\""+src+"\">"
	112	+ pageContent += "<figcaption><em>"+caption+"</em></figcaption>"
	113	+ try:
	114	+ pageContent += elementChildren["children"][0]["data"]
	115	+ except:
	116	+ pass
	117	+ elif "iframe" == htmlTypeChildren:
	118	+ src=elementChildren["attribs"]["src"]
	119	+ caption=elementChildren["attribs"]["title"]
	120	+ pageContent += "<iframe src=\""+src+"\">"
	121	+ try:
	122	+ pageContent += elementChildren["children"][0]["data"]
	123	+ except:
	124	+ pass
	125	+ pageContent += "</iframe>"
	126	+ pageContent += "<figcaption><em><a href=\""+src+"\" target=\"new-"+str(cpt)+"\">"+caption+"</a></em></figcaption>"
	127	+ elif "em" == htmlTypeChildren:
	128	+ pageContent += "<"+htmlTypeChildren+">"
	129	+ try:
	130	+ pageContent += elementChildren["children"][0]["data"]
	131	+ except:
	132	+ pass
	133	+ pageContent += "</"+htmlTypeChildren+">"
	134	+ else:
	135	+ print("OTHER : "+htmlTypeChildren)
	136	+ pageContent += "</"+htmlType+">\n"
	137	+ cpt+=1
	138	+ pageContent+="</article>"
	139	+ return pageContent

+46

newsParser/newsParser/newsStraitsTimes.py

View

...	...	@@ -0,0 +1,46 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	9	+ content = r.text
	10	+ pageContent = ""
	11	+ articleCstBegin = "<div class=\"odd field-item\" itemprop=\"articleBody\""
	12	+ articleCstEnd = "<div class=\"token-insert-entity-wrapper-manual pull-left mode-embed_related_story_q\" data-dsnote=\"mchammer\">"
	13	+ indexBegin = content.index(articleCstBegin)
	14	+ indexEnd = content.index(articleCstEnd)
	15	+
	16	+ articleStrImageUrl = newsParser.articleImage(content)
	17	+ articleStrTitle = newsParser.articleTitle(content)
	18	+ articleStrDescription = newsParser.articleDescription(content)
	19	+
	20	+ pageContent = ""
	21	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	22	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	23	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	24	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	25	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	26	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	27	+
	28	+ article_only = "<h2>"+articleStrTitle+"</h2>\n"
	29	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	30	+ article_only += content[indexBegin:indexEnd]
	31	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	32	+ article_only = re.sub(r"</amp-img>", '', article_only)
	33	+ article_only = re.sub(r"<h2", '<h3', article_only)
	34	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	35	+ article_only = re.sub(r"<h1", '<h2', article_only)
	36	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	37	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	38	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	39	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	40	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	41	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	42	+ article_only = article_only.replace("><", ">\n<")
	43	+
	44	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.straitstimes.com/', article_only)
	45	+ pageContent += "<article>"+article_only+"</article>"
	46	+ return pageContent

+64

newsParser/newsParser/newsSudOuest.py

View

...	...	@@ -0,0 +1,64 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	23	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	24	+
	25	+ articleCstBegin = "<article "
	26	+ # ~ articleCstEnd = "</article>"
	27	+ articleCstEnd = "<div class=\"article-full__footer\">"
	28	+ articleCstEnd2 = "<section subscriptions-section=\"content-not-granted\">"
	29	+ articleCstEnd3 = "</article>"
	30	+ indexBegin = content.index(articleCstBegin)
	31	+ try:
	32	+ indexEnd = content.index(articleCstEnd)
	33	+ except:
	34	+ try:
	35	+ indexEnd = content.index(articleCstEnd2)
	36	+ except:
	37	+ indexEnd = content.index(articleCstEnd3)
	38	+ article_only = content[indexBegin:indexEnd]
	39	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	40	+ article_only = re.sub(r"</amp-img>", '', article_only)
	41	+ article_only = re.sub(r"<h2", '<h3', article_only)
	42	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	43	+ article_only = re.sub(r"<h1", '<h2', article_only)
	44	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	45	+ article_only = re.sub(r"<path class=\"icon-base-fill\"(.*?)</path>",'', article_only,re.MULTILINE)
	46	+ article_only = re.sub(r"<path class=\"icon-base-fill\"(.*?)\"/>",'', article_only)
	47	+ article_only = re.sub(r"onclick=\"window.open\("https:(.*?)\);\">",'', article_only)
	48	+ article_only = re.sub(r"<span class=\"text\">S'abonner</span>",'', article_only)
	49	+ article_only = re.sub(r"<div id=\"pub_dfp_inread1\" class=\"pub pub_dfp pub_dfp_inread1 upto-tablet base-margin-bottom pub_with_light_background\"></div>",'', article_only,re.MULTILINE)
	50	+ # ~ article_only = re.sub(r"<svg class=\"icon-share\"(.*?)</svg>",'', article_only,re.MULTILINE)
	51	+ article_only = re.sub(r"href=\"mailto:\?subject=(.*?)\"",'', article_only,re.MULTILINE)
	52	+ article_only = re.sub(r"<svg class=\"(.?)\" viewBox=\"0 0 (.?) (.*?)\">",'<svg>', article_only)
	53	+ article_only = re.sub(r"<svg viewBox=\"0 0 (.?) (.?)\">",'<svg>', article_only)
	54	+ article_only = re.sub(r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline\"",'', article_only)
	55	+ article_only = re.sub(r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline upto-tablet\"",'', article_only)
	56	+ article_only = re.sub(r"<button",'<button style="display:none;">', article_only)
	57	+ article_only = re.sub(r"<aside class=\"social-links",'<aside style="display:none;" class="social-links', article_only)
	58	+ article_only = re.sub(r"onclick=\"if\(navigator\.share\) (.*?)return false;\" >",'', article_only)
	59	+
	60	+ # ~ article_only = article_only.replace("><", ">\n<")
	61	+
	62	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.sudouest.fr/', article_only)
	63	+ pageContent += "<article>"+article_only+"</article>"
	64	+ return pageContent

+69

newsParser/newsParser/newsTelerama.py

View

...	...	@@ -0,0 +1,69 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\">"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\">"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+def articleAbonnes(content):
	23	+ articleAbonnes = "réservé aux abonnés"
	24	+ articleType = ""
	25	+ try:
	26	+ indexAbonnes = content.index(articleAbonnes)
	27	+ articleType = "Abonnés"
	28	+ except:
	29	+ articleType = ""
	30	+ return articleType
	31	+
	32	+def article(url):
	33	+ say("Article: "+url)
	34	+ r = requests.get(url, allow_redirects=True)
	35	+ content = r.text
	36	+ articleStrImageUrl = articleImage(content)
	37	+ articleStrTitle = articleTitle(content)
	38	+ articleStrType = articleAbonnes(content)
	39	+ articleStrDescription = newsParser.articleDescription(content)
	40	+
	41	+ pageContent = ""
	42	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	43	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	44	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	45	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	46	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	47	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	48	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	49	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	50	+
	51	+ articleCstBegin = "<article "
	52	+ if articleStrType is "":
	53	+ articleCstEnd = "</article>"
	54	+ else:
	55	+ articleCstEnd = "Cet article est réservé aux abonnés"
	56	+ indexBegin = content.index(articleCstBegin)
	57	+ indexEnd = content.index(articleCstEnd)
	58	+ article_only = content[indexBegin:indexEnd]
	59	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	60	+ article_only = re.sub(r"</amp-img>", '', article_only)
	61	+ article_only = re.sub(r"<h2", '<h3', article_only)
	62	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	63	+ article_only = re.sub(r"<h1", '<h2', article_only)
	64	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	65	+
	66	+ article_only = re.sub(r"href=\"\/", 'href=\"//wwww.telerama.fr/', article_only)
	67	+ pageContent += "<article>"+article_only+"</article>"
	68	+ pageContent += "<p>"+articleStrType+"</p>"
	69	+ return pageContent

+43

newsParser/newsParser/newsTheAtlantic.py

View

...	...	@@ -0,0 +1,43 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
	9	+ r = requests.get(url, allow_redirects=True)
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+
	16	+ pageContent = ""
	17	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	18	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	19	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	20	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	21	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	22	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	23	+
	24	+ articleCstBegin = "<article "
	25	+ articleCstEnd = "</article>"
	26	+ indexBegin = content.index(articleCstBegin)
	27	+ indexEnd = content.index(articleCstEnd,indexBegin)
	28	+ article_only = content[indexBegin:indexEnd]
	29	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	30	+ article_only = re.sub(r"</amp-img>", '', article_only)
	31	+ article_only = re.sub(r"<h2", '<h3', article_only)
	32	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	33	+ article_only = re.sub(r"<h1", '<h2', article_only)
	34	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	35	+ article_only = re.sub(r"<gtp-ad(.*?)></gpt>",'',article_only)
	36	+ article_only = article_only.replace("><", ">\n<")
	37	+ #<ul class="ArticleRecirc_list__3WyEw">
	38	+ article_only = re.sub(r"<h3 class=\"ArticleRecirc_heading__(.*?)\">Recommended Reading</h3>",'',article_only)
	39	+ article_only = re.sub(r"<ul class=\"ArticleRecirc_list__(.*?)\">", '<ul style="display: none;">', article_only,re.MULTILINE)
	40	+ article_only = re.sub(r"<button class=\"ArticleShare_shareButton__(.*?)\" aria-haspopup=\"true\" aria-controls=\"expanded-share-kit\" aria-expanded=\"false\" aria-label=\"Open Share Menu\" data-action=\"click share - expand\">Share</button>", '', article_only)
	41	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.theatlantic.com/', article_only)
	42	+ pageContent += "<article>"+article_only+"</article>"
	43	+ return pageContent

+74

newsParser/newsParser/newsTheGuardian.py

View

...	...	@@ -0,0 +1,74 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\"/>"
	9	+ # ~ articleImgEnd ="\?width="
	10	+ indexImgBegin = content.index(articleImgBegin)
	11	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	12	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	13	+ return image
	14	+
	15	+def articleTitle(content):
	16	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	17	+ articleImgEnd ="\"/>"
	18	+ indexImgBegin = content.index(articleImgBegin)
	19	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	20	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	21	+ return title
	22	+
	23	+def articleDescription(content):
	24	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	25	+ articleImgEnd ="\"/>"
	26	+ indexImgBegin = content.index(articleImgBegin)
	27	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	28	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	29	+ return title
	30	+
	31	+
	32	+def article(url):
	33	+ say("Article: "+url)
	34	+ # ~ url = url.replace("www.theguardian.com","amp.theguardian.com")
	35	+ r = requests.get(url, allow_redirects=True)
	36	+ content = r.text
	37	+
	38	+ articleCstBegin = "<div class=\"article-body-commercial-selector"
	39	+ articleCstEnd = "<div id=\"slot-body-end\">"
	40	+ indexBegin = content.index(articleCstBegin)
	41	+ indexEnd = content.index(articleCstEnd)
	42	+ articleStrImageUrl = articleImage(content)
	43	+ articleStrTitle = articleTitle(content)
	44	+ articleStrDescription = articleDescription(content)
	45	+
	46	+ pageContent = ""
	47	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	48	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	49	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	50	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	51	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	52	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	53	+
	54	+ article_only = ""
	55	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	56	+ article_only += "<em>"+articleStrDescription+"</em>\n"
	57	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	58	+ article_only += content[indexBegin:indexEnd]
	59	+ article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
	60	+ article_only = re.sub(r"<h2", '<h3', article_only)
	61	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	62	+ article_only = re.sub(r"<h1", '<h2', article_only)
	63	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	64	+ article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
	65	+ # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
	66	+ # ~ article_only = re.sub(r"<picture>(.*?)</picture>", '', article_only)
	67	+ article_only = re.sub(r"<picture><source media=\"(.?)\" srcSet=\"(.?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
	68	+ article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
	69	+ # ~ article_only = re.sub(r"<span class=\"(.*?)\">Image</span>",'',article_only)
	70	+ article_only = article_only.replace("><", ">\n<")
	71	+
	72	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
	73	+ pageContent += "<article>"+article_only+"</article>"
	74	+ return pageContent

+63

newsParser/newsParser/newsTheStarMy.py

View

...	...	@@ -0,0 +1,63 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\" />"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\" />"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+def article(url):
	23	+ say("Article: "+url)
	24	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	25	+ content = r.text
	26	+ pageContent = ""
	27	+ articleCstBegin = "<article"
	28	+ # ~ articleCstEnd = "</article>"
	29	+ articleCstEnd = "<!-- /Pagination -->"
	30	+ indexBegin = content.index(articleCstBegin)
	31	+ indexEnd = content.index(articleCstEnd)
	32	+
	33	+ articleStrImageUrl = articleImage(content)
	34	+ articleStrTitle = articleTitle(content)
	35	+ articleStrDescription = newsParser.articleDescription(content)
	36	+
	37	+ pageContent = ""
	38	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	39	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	40	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	41	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	42	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	43	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	44	+
	45	+ article_only = "<h2>"+articleStrTitle+"</h2>\n"
	46	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	47	+ article_only += content[indexBegin:indexEnd]
	48	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	49	+ article_only = re.sub(r"</amp-img>", '', article_only)
	50	+ article_only = re.sub(r"<h2", '<h3', article_only)
	51	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	52	+ article_only = re.sub(r"<h1", '<h2', article_only)
	53	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	54	+ article_only = re.sub(r"<a class=\"share(.?)\" data-social-name=\"(.?)\" href=\"(.*?)\" target=\"_blank\">", '<a href="">', article_only)
	55	+ article_only = re.sub(r"<li class=\"(.*?) share-bar__item\">", '<li>', article_only)
	56	+ article_only = re.sub(r"<div class=\"share-bar share-bar--sticky yr-share\">",'<div class="share-bar share-bar--sticky yr-share" style="display:none;">', article_only)
	57	+ article_only = re.sub(r"<div class=\"(.?) share-bar(.?)>",'<div style="display:none;">', article_only)
	58	+ article_only = re.sub(r"<div class=\"yr-share\">",'<div style="display:none;">', article_only)
	59	+ article_only = article_only.replace("><", ">\n<")
	60	+
	61	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.straitstimes.com/', article_only)
	62	+ pageContent += "<article>"+article_only+"</article>"
	63	+ return pageContent

+81

newsParser/newsParser/newsTheVerge.py

View

...	...	@@ -0,0 +1,81 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\" />"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\" />"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+def articleDescription(content):
	23	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	24	+ articleImgEnd ="\" />"
	25	+ indexImgBegin = content.index(articleImgBegin)
	26	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	27	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	28	+ return title
	29	+
	30	+
	31	+def article(url):
	32	+ say("Article: "+url)
	33	+ r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
	34	+ content = r.text
	35	+
	36	+ articleCstBegin = "<div class=\"c-entry-content \">"
	37	+ articleCstBegin2 = "<article"
	38	+ articleCstEnd = "<div class=\"u-hidden-text\" id=\"formatter-datter\""
	39	+ # ~ articleCstEnd = "<section class=\"c-nextclick\">"
	40	+ articleCstEnd2 = "<section class=\"c-related-list\">"
	41	+ articleCstEnd3 = "</article"
	42	+
	43	+ articleStrImageUrl = articleImage(content)
	44	+ articleStrTitle = articleTitle(content)
	45	+ articleStrDescription = articleDescription(content)
	46	+
	47	+ pageContent = ""
	48	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	49	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	50	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	51	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	52	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	53	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	54	+
	55	+ try:
	56	+ indexBegin = content.index(articleCstBegin)
	57	+ except:
	58	+ indexBegin = content.index(articleCstBegin2)
	59	+ try:
	60	+ indexEnd = content.index(articleCstEnd)
	61	+ except:
	62	+ try:
	63	+ indexEnd = content.index(articleCstEnd2)
	64	+ except:
	65	+ indexEnd = content.index(articleCstEnd3)
	66	+ article_only = ""
	67	+ article_only += "<h2>"+articleStrTitle+"</h2>\n"
	68	+ article_only += "<em>"+articleStrDescription+"</em>\n"
	69	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	70	+ article_only += content[indexBegin:indexEnd]
	71	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	72	+ article_only = re.sub(r"</amp-img>", '', article_only)
	73	+ article_only = re.sub(r"<h2", '<h3', article_only)
	74	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	75	+ article_only = re.sub(r"<h1", '<h2', article_only)
	76	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	77	+
	78	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.theverge.com/', article_only)
	79	+ article_only = re.sub(r"src=\"\/", 'src=\"//www.theverge.com/', article_only)
	80	+ pageContent += "<article>"+article_only+"</article>"
	81	+ return pageContent

+67

newsParser/newsParser/newsViceCom.py

View

...	...	@@ -0,0 +1,67 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleImage(content):
	7	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	8	+ articleImgEnd ="\"/>"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return image
	13	+
	14	+def articleTitle(content):
	15	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	16	+ articleImgEnd ="\"/>"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+
	23	+def article(url):
	24	+ say("Article: "+url)
	25	+ r = requests.get(url, allow_redirects=True)
	26	+ content = r.text
	27	+ pageContent = ""
	28	+ articleCstBegin = "<div class=\"short-form__body\">"
	29	+ articleCstBegin2 = "<div class=\"article__longform__content\">"
	30	+ articleCstEnd = "<div class=\"article__tagged\">"
	31	+ articleCstEnd2 = "<div class=\"article__longform__tags\">"
	32	+ try:
	33	+ indexBegin = content.index(articleCstBegin)
	34	+ except:
	35	+ indexBegin = content.index(articleCstBegin2)
	36	+
	37	+ try:
	38	+ indexEnd = content.index(articleCstEnd)
	39	+ except:
	40	+ indexEnd = content.index(articleCstEnd2)
	41	+
	42	+ # ~ indexEnd = content.index(articleCstEnd)
	43	+ articleStrImageUrl = articleImage(content)
	44	+ articleStrTitle = articleTitle(content)
	45	+ articleStrDescription = newsParser.articleDescription(content)
	46	+
	47	+ pageContent = ""
	48	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	49	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	50	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	51	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	52	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	53	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	54	+
	55	+ article_only = "<h2>"+articleStrTitle+"</h2>\n"
	56	+ article_only += "<img src=\""+articleStrImageUrl+"\">\n"
	57	+ article_only += content[indexBegin:indexEnd]
	58	+ article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
	59	+ article_only = re.sub(r"<h2", '<h3', article_only)
	60	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	61	+ article_only = re.sub(r"<h1", '<h2', article_only)
	62	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	63	+ article_only = article_only.replace("><", ">\n<")
	64	+
	65	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.vice.com/', article_only)
	66	+ pageContent += "<article>"+article_only+"</article>"
	67	+ return pageContent

+68

newsParser/newsParser/newsWaPo.py

View

...	...	@@ -0,0 +1,68 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ r.encoding = r.apparent_encoding
	10	+ content = r.text
	11	+
	12	+ articleStrImageUrl = newsParser.articleImage(content)
	13	+ articleStrTitle = newsParser.articleTitle(content)
	14	+ articleStrDescription = newsParser.articleDescription(content)
	15	+ articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)
	16	+
	17	+ pageContent = ""
	18	+ pageContent += "<meta charset=\"utf-8\"/>"
	19	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	20	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	21	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	22	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	23	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	24	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	25	+
	26	+ articleCstBegin = "<article"
	27	+ articleCstEnd = "<div class=\"mt-md\">"
	28	+ articleCstEnd2 = "</article>"
	29	+ indexBegin = content.index(articleCstBegin)
	30	+ try:
	31	+ indexEnd = content.index(articleCstEnd)
	32	+ except:
	33	+ indexEnd = content.index(articleCstEnd2)
	34	+
	35	+
	36	+ article_only = "<h2>"+articleStrTitle+"</h2>"
	37	+ article_only = "<img src=\""+articleStrImageUrl+"\">"
	38	+
	39	+ article_only += content[indexBegin:indexEnd]
	40	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	41	+ article_only = re.sub(r"</amp-img>", '', article_only)
	42	+ article_only = re.sub(r"<h2", '<h3', article_only)
	43	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	44	+ article_only = re.sub(r"<h1", '<h2', article_only)
	45	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	46	+ # ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only)
	47	+ article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only)
	48	+ article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
	49	+ article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only)
	50	+ article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only)
	51	+ article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only)
	52	+ article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only)
	53	+ article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only)
	54	+ article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only)
	55	+ article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)
	56	+ article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)
	57	+ article_only = re.sub(r"<div data-qa=\"drop-cap-letter\">", '<div>', article_only)
	58	+ article_only = re.sub(r"filter:blur\(10px\);", '', article_only)
	59	+ article_only = re.sub(r"<div class=\"bg-pattern-1\".+?>", '<div>', article_only)
	60	+ article_only = re.sub(r"<div class=\"bg-pattern-2\".+?>", '<div>', article_only)
	61	+ article_only = re.sub(r"<img class=\"dn canvas-foreground\" src=\".+?\"/>", '', article_only)
	62	+ article_only = re.sub(r"<div class=\"subhead .+?>", '<div>', article_only)
	63	+ #article_only = re.sub(r"<canvas id=\"artboard\" style=\".+\">", '<canvas>', article_only)
	64	+ #article_only = re.sub(r"", '', article_only)
	65	+ article_only = article_only.replace("><", ">\n<")
	66	+
	67	+ pageContent += "<article>"+article_only+"</article>"
	68	+ return pageContent

+66

newsParser/newsParser/newsYahooCom.py

View

...	...	@@ -0,0 +1,66 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def articleTitle(content):
	7	+ articleImgBegin ="<meta property=\"og:title\" content=\""
	8	+ articleImgEnd ="\" />"
	9	+ indexImgBegin = content.index(articleImgBegin)
	10	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	11	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ return title
	13	+
	14	+def articleImage(content):
	15	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	16	+ articleImgEnd ="\" />"
	17	+ indexImgBegin = content.index(articleImgBegin)
	18	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	19	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	20	+ return title
	21	+
	22	+
	23	+def articleDescription(content):
	24	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	25	+ articleImgEnd ="\" />"
	26	+ indexImgBegin = content.index(articleImgBegin)
	27	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin)
	28	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	29	+ return title
	30	+
	31	+def article(url):
	32	+ say("Article: "+url)
	33	+ url = url.replace("dna.fr/","dna.fr/amp/")
	34	+ r = requests.get(url, allow_redirects=True)
	35	+ content = r.text
	36	+ articleCstBegin = "<div class=\"caas-body\">"
	37	+ articleCstEnd = "</article>"
	38	+ articleStrTitle = articleTitle(content)
	39	+ articleStrImageUrl = articleImage(content)
	40	+ articleStrDescription = articleDescription(content)
	41	+
	42	+ pageContent = ""
	43	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	44	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	45	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	46	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	47	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	48	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	49	+
	50	+ pageContent += "<h2>"+articleStrTitle+"</h2>\n"
	51	+ pageContent += "<em>"+articleStrDescription+"</em>\n"
	52	+ pageContent += "<img src=\""+articleStrImageUrl+"\">\n"
	53	+
	54	+ indexBegin = content.index(articleCstBegin)
	55	+ indexEnd = content.index(articleCstEnd)
	56	+ article_only = ""
	57	+ article_only = content[indexBegin:indexEnd]
	58	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	59	+ article_only = re.sub(r"</amp-img>", '', article_only)
	60	+ article_only = re.sub(r"<h2", '<h3', article_only)
	61	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	62	+ article_only = re.sub(r"<h1", '<h2', article_only)
	63	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	64	+ article_only = re.sub(r"href=\"\/", 'href=\"//news.yahoo.com/', article_only)
	65	+ pageContent += "<article>"+article_only+"</article>"
	66	+ return pageContent

+38

newsParser/newsParser/newsZDNetFr.py

View

...	...	@@ -0,0 +1,38 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+import newsParser
	5	+
	6	+def article(url):
	7	+ say("Article: "+url)
	8	+ r = requests.get(url, allow_redirects=True)
	9	+ content = r.text
	10	+
	11	+ articleStrImageUrl = newsParser.articleImage(content)
	12	+ articleStrTitle = newsParser.articleTitle(content)
	13	+ articleStrDescription = newsParser.articleDescription(content)
	14	+
	15	+ pageContent = ""
	16	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	17	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	18	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	19	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	20	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	21	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	22	+
	23	+ articleCstBegin = "<article "
	24	+ articleCstEnd = "</article>"
	25	+ indexBegin = content.index(articleCstBegin)
	26	+ indexEnd = content.index(articleCstEnd)
	27	+ article_only = content[indexBegin:indexEnd]
	28	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	29	+ article_only = re.sub(r"</amp-img>", '', article_only)
	30	+ article_only = re.sub(r"<h2", '<h3', article_only)
	31	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	32	+ article_only = re.sub(r"<h1", '<h2', article_only)
	33	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	34	+ article_only = article_only.replace("><", ">\n<")
	35	+
	36	+ article_only = re.sub(r"href=\"\/", 'href=\"//www.zdnet.fr/', article_only)
	37	+ pageContent += "<article>"+article_only+"</article>"
	38	+ return pageContent