Showing 2 changed files with 4 additions and 60 deletions
+4 -7
newsParser/__init__.py
... ...
@@ -35,7 +35,6 @@ from .newsParser import newsTheStarMy
35 35
 from .newsParser import newsNSTMy
36 36
 from .newsParser import newsLaDepeche
37 37
 from .newsParser import newsTheGuardian
38
-from .newsParser import newsBloomberg
39 38
 from .newsParser import newsFranceTVInfo
40 39
 from .newsParser import newsTheVerge
41 40
 from .newsParser import newsBondyBlog
... ...
@@ -64,9 +63,9 @@ def articleElement(typeElement,content):
64 63
   element=""
65 64
   if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content:
66 65
     #print("=================== Buzzfeed")
67
-    if typeElement is "title":
66
+    if typeElement == "title":
68 67
       articleElementBegin ="\"headline\": \""
69
-    elif typeElement is "description":
68
+    elif typeElement == "description":
70 69
       articleElementBegin ="\"description\": \""
71 70
     articleElementEnd   ="\","
72 71
     indexElementBegin = content.index(articleElementBegin)
... ...
@@ -76,12 +75,12 @@ def articleElement(typeElement,content):
76 75
     #print("=================== Lemonde")
77 76
     articleElementBegin=""
78 77
     articleElementEnd   ="\">"
79
-    if typeElement is "image":
78
+    if typeElement == "image":
80 79
       articleElementBegin ="<meta property=\"og:image\" content=\"http"
81 80
       indexElementBegin = content.index(articleElementBegin)
82 81
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
83 82
       element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd]
84
-    elif typeElement is "title":
83
+    elif typeElement == "title":
85 84
       articleElementBegin ="<meta property=\"og:title\" content=\""
86 85
       indexElementBegin = content.index(articleElementBegin)
87 86
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
... ...
@@ -193,8 +192,6 @@ def getArticle(url):
193 192
       data_page += newsParser.newsLaDepeche.article(url)
194 193
     elif "guardian.com" in url or "guardian.co.uk" in url:
195 194
       data_page += newsParser.newsTheGuardian.article(url)
196
-    elif "bloomberg.com" in url:
197
-      data_page += newsParser.newsBloomberg.article(url)
198 195
     elif "francetvinfo.fr" in url:
199 196
       data_page += newsParser.newsFranceTVInfo.article(url)
200 197
     elif "theverge.com" in url:
-53
newsParser/newsParser/newsBloomberg.py
... ...
@@ -1,53 +0,0 @@
1
-from userio import *
2
-import requests
3
-import re
4
-import newsParser
5
-
6
-def article(url):
7
-  say("Article: "+url)
8
-  r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
9
-  content = r.text
10
-  articleCstBegin = "<div class=\"article-content\">"
11
-  articleCstBegin2 = "<time class=\"article-timestamp\""
12
-  articleCstEnd   = "<div class=\"bottom-left-rail-touts-spacer\">"
13
-  try:
14
-    indexBegin = content.index(articleCstBegin)
15
-  except:
16
-    try:
17
-      indexBegin = content.index(articleCstBegin2)
18
-    except:
19
-      indexBegin = 0
20
-  try:
21
-    indexEnd   = content.index(articleCstEnd)
22
-  except:
23
-    indexEnd   = 0
24
-  articleStrImageUrl = newsParser.articleImage(content)
25
-  articleStrTitle = newsParser.articleTitle(content)
26
-  articleStrDescription = newsParser.articleDescription(content)
27
-  
28
-  pageContent = ""
29
-  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
30
-  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
31
-  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
32
-  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
33
-  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
34
-  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
35
-  
36
-  article_only = ""
37
-  article_only += "<h2>"+articleStrTitle+"</h2>\n"
38
-  article_only += "<em>"+articleStrDescription+"</em>\n"
39
-  article_only += "<img src=\""+articleStrImageUrl+"\">\n"
40
-  article_only += content[indexBegin:indexEnd]
41
-  article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
42
-  article_only = re.sub(r"<h2", '<h3', article_only)
43
-  article_only = re.sub(r"</h2>", '</h3>', article_only)
44
-  article_only = re.sub(r"<h1", '<h2', article_only)
45
-  article_only = re.sub(r"</h1>", '</h2>', article_only)
46
-  article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
47
-  article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", "<img src=\"\g<2>\">", article_only)
48
-  article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
49
-  article_only = article_only.replace("><", ">\n<")
50
-  
51
-  article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
52
-  pageContent += "<article>"+article_only+"</article>"
53
-  return pageContent