Removed Bloomberg ・ 53589c5 ・ Gitprep

- Removed Bloomberg;
- Browse files
- ycawidro committed on 2022-07-29
- 1 parent 283cedd
  
  commit 53589c5eea83ef6e70b4b8283a209186a4dd0065

Showing 2 changed files with 4 additions and 60 deletions

+4 -7

newsParser/__init__.py

@@ -35,7 +35,6 @@ from .newsParser import newsTheStarMy
 from .newsParser import newsNSTMy
 from .newsParser import newsLaDepeche
 from .newsParser import newsTheGuardian
-from .newsParser import newsBloomberg
 from .newsParser import newsFranceTVInfo
 from .newsParser import newsTheVerge
 from .newsParser import newsBondyBlog
@@ -64,9 +63,9 @@ def articleElement(typeElement,content):
   element=""
   if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
     #print("=================== Buzzfeed")
-    if typeElement is "title":
+    if typeElement == "title":
       articleElementBegin ="\"headline\": \""
-    elif typeElement is "description":
+    elif typeElement == "description":
       articleElementBegin ="\"description\": \""
     articleElementEnd   ="\","
     indexElementBegin = content.index(articleElementBegin)
@@ -76,12 +75,12 @@ def articleElement(typeElement,content):
     #print("=================== Lemonde")
     articleElementBegin=""
     articleElementEnd   ="\">"
-    if typeElement is "image":
+    if typeElement == "image":
       articleElementBegin ="<meta property=\"og:image\" content=\"http"
       indexElementBegin = content.index(articleElementBegin)
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
       element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd]
-    elif typeElement is "title":
+    elif typeElement == "title":
       articleElementBegin ="<meta property=\"og:title\" content=\""
       indexElementBegin = content.index(articleElementBegin)
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
@@ -193,8 +192,6 @@ def getArticle(url):
       data_page += newsParser.newsLaDepeche.article(url)
     elif "guardian.com" in url or "guardian.co.uk" in url:
       data_page += newsParser.newsTheGuardian.article(url)
-    elif "bloomberg.com" in url:
-      data_page += newsParser.newsBloomberg.article(url)
     elif "francetvinfo.fr" in url:
       data_page += newsParser.newsFranceTVInfo.article(url)
     elif "theverge.com" in url:

-53

newsParser/newsParser/newsBloomberg.py

View

@@ -1,53 +0,0 @@
-from userio import *
-import requests
-import re
-import newsParser
-
-def article(url):
-  say("Article: "+url)
-  r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
-  content = r.text
-  articleCstBegin = "<div class=\"article-content\">"
-  articleCstBegin2 = "<time class=\"article-timestamp\""
-  articleCstEnd   = "<div class=\"bottom-left-rail-touts-spacer\">"
-  try:
-    indexBegin = content.index(articleCstBegin)
-  except:
-    try:
-      indexBegin = content.index(articleCstBegin2)
-    except:
-      indexBegin = 0
-  try:
-    indexEnd   = content.index(articleCstEnd)
-  except:
-    indexEnd   = 0
-  articleStrImageUrl = newsParser.articleImage(content)
-  articleStrTitle = newsParser.articleTitle(content)
-  articleStrDescription = newsParser.articleDescription(content)
-  
-  pageContent = ""
-  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
-  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
-  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
-  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
-  
-  article_only = ""
-  article_only += "<h2>"+articleStrTitle+"</h2>\n"
-  article_only += "<em>"+articleStrDescription+"</em>\n"
-  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
-  article_only += content[indexBegin:indexEnd]
-  article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
-  article_only = re.sub(r"<h2", '<h3', article_only)
-  article_only = re.sub(r"</h2>", '</h3>', article_only)
-  article_only = re.sub(r"<h1", '<h2', article_only)
-  article_only = re.sub(r"</h1>", '</h2>', article_only)
-  article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
-  article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
-  article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
-  article_only = article_only.replace("><", ">\n<")
-  
-  article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
-  pageContent += "<article>"+article_only+"</article>"
-  return pageContent

...	...	@@ -35,7 +35,6 @@ from .newsParser import newsTheStarMy
35	35	from .newsParser import newsNSTMy
36	36	from .newsParser import newsLaDepeche
37	37	from .newsParser import newsTheGuardian
38		-from .newsParser import newsBloomberg
39	38	from .newsParser import newsFranceTVInfo
40	39	from .newsParser import newsTheVerge
41	40	from .newsParser import newsBondyBlog
...	...	@@ -64,9 +63,9 @@ def articleElement(typeElement,content):
64	63	element=""
65	64	if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
66	65	#print("=================== Buzzfeed")
67		- if typeElement is "title":
	66	+ if typeElement == "title":
68	67	articleElementBegin ="\"headline\": \""
69		- elif typeElement is "description":
	68	+ elif typeElement == "description":
70	69	articleElementBegin ="\"description\": \""
71	70	articleElementEnd ="\","
72	71	indexElementBegin = content.index(articleElementBegin)
...	...	@@ -76,12 +75,12 @@ def articleElement(typeElement,content):
76	75	#print("=================== Lemonde")
77	76	articleElementBegin=""
78	77	articleElementEnd ="\">"
79		- if typeElement is "image":
	78	+ if typeElement == "image":
80	79	articleElementBegin ="<meta property=\"og:image\" content=\"http"
81	80	indexElementBegin = content.index(articleElementBegin)
82	81	indexElementEnd = content.index(articleElementEnd,indexElementBegin)
83	82	element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd]
84		- elif typeElement is "title":
	83	+ elif typeElement == "title":
85	84	articleElementBegin ="<meta property=\"og:title\" content=\""
86	85	indexElementBegin = content.index(articleElementBegin)
87	86	indexElementEnd = content.index(articleElementEnd,indexElementBegin)
...	...	@@ -193,8 +192,6 @@ def getArticle(url):
193	192	data_page += newsParser.newsLaDepeche.article(url)
194	193	elif "guardian.com" in url or "guardian.co.uk" in url:
195	194	data_page += newsParser.newsTheGuardian.article(url)
196		- elif "bloomberg.com" in url:
197		- data_page += newsParser.newsBloomberg.article(url)
198	195	elif "francetvinfo.fr" in url:
199	196	data_page += newsParser.newsFranceTVInfo.article(url)
200	197	elif "theverge.com" in url:

...	...	@@ -1,53 +0,0 @@
1		-from userio import *
2		-import requests
3		-import re
4		-import newsParser
5		-
6		-def article(url):
7		- say("Article: "+url)
8		- r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
9		- content = r.text
10		- articleCstBegin = "<div class=\"article-content\">"
11		- articleCstBegin2 = "<time class=\"article-timestamp\""
12		- articleCstEnd = "<div class=\"bottom-left-rail-touts-spacer\">"
13		- try:
14		- indexBegin = content.index(articleCstBegin)
15		- except:
16		- try:
17		- indexBegin = content.index(articleCstBegin2)
18		- except:
19		- indexBegin = 0
20		- try:
21		- indexEnd = content.index(articleCstEnd)
22		- except:
23		- indexEnd = 0
24		- articleStrImageUrl = newsParser.articleImage(content)
25		- articleStrTitle = newsParser.articleTitle(content)
26		- articleStrDescription = newsParser.articleDescription(content)
27		-
28		- pageContent = ""
29		- pageContent += "<meta property=\"og:type\" content=\"article\">\n"
30		- pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
31		- pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
32		- pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
33		- pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
34		- pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
35		-
36		- article_only = ""
37		- article_only += "<h2>"+articleStrTitle+"</h2>\n"
38		- article_only += "<em>"+articleStrDescription+"</em>\n"
39		- article_only += "<img src=\""+articleStrImageUrl+"\">\n"
40		- article_only += content[indexBegin:indexEnd]
41		- article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
42		- article_only = re.sub(r"<h2", '<h3', article_only)
43		- article_only = re.sub(r"</h2>", '</h3>', article_only)
44		- article_only = re.sub(r"<h1", '<h2', article_only)
45		- article_only = re.sub(r"</h1>", '</h2>', article_only)
46		- article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
47		- article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
48		- article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
49		- article_only = article_only.replace("><", ">\n<")
50		-
51		- article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
52		- pageContent += "<article>"+article_only+"</article>"
53		- return pageContent