... | ... |
@@ -35,7 +35,6 @@ from .newsParser import newsTheStarMy |
35 | 35 |
from .newsParser import newsNSTMy |
36 | 36 |
from .newsParser import newsLaDepeche |
37 | 37 |
from .newsParser import newsTheGuardian |
38 |
-from .newsParser import newsBloomberg |
|
39 | 38 |
from .newsParser import newsFranceTVInfo |
40 | 39 |
from .newsParser import newsTheVerge |
41 | 40 |
from .newsParser import newsBondyBlog |
... | ... |
@@ -64,9 +63,9 @@ def articleElement(typeElement,content): |
64 | 63 |
element="" |
65 | 64 |
if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content: |
66 | 65 |
#print("=================== Buzzfeed") |
67 |
- if typeElement is "title": |
|
66 |
+ if typeElement == "title": |
|
68 | 67 |
articleElementBegin ="\"headline\": \"" |
69 |
- elif typeElement is "description": |
|
68 |
+ elif typeElement == "description": |
|
70 | 69 |
articleElementBegin ="\"description\": \"" |
71 | 70 |
articleElementEnd ="\"," |
72 | 71 |
indexElementBegin = content.index(articleElementBegin) |
... | ... |
@@ -76,12 +75,12 @@ def articleElement(typeElement,content): |
76 | 75 |
#print("=================== Lemonde") |
77 | 76 |
articleElementBegin="" |
78 | 77 |
articleElementEnd ="\">" |
79 |
- if typeElement is "image": |
|
78 |
+ if typeElement == "image": |
|
80 | 79 |
articleElementBegin ="<meta property=\"og:image\" content=\"http" |
81 | 80 |
indexElementBegin = content.index(articleElementBegin) |
82 | 81 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
83 | 82 |
element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd] |
84 |
- elif typeElement is "title": |
|
83 |
+ elif typeElement == "title": |
|
85 | 84 |
articleElementBegin ="<meta property=\"og:title\" content=\"" |
86 | 85 |
indexElementBegin = content.index(articleElementBegin) |
87 | 86 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
... | ... |
@@ -193,8 +192,6 @@ def getArticle(url): |
193 | 192 |
data_page += newsParser.newsLaDepeche.article(url) |
194 | 193 |
elif "guardian.com" in url or "guardian.co.uk" in url: |
195 | 194 |
data_page += newsParser.newsTheGuardian.article(url) |
196 |
- elif "bloomberg.com" in url: |
|
197 |
- data_page += newsParser.newsBloomberg.article(url) |
|
198 | 195 |
elif "francetvinfo.fr" in url: |
199 | 196 |
data_page += newsParser.newsFranceTVInfo.article(url) |
200 | 197 |
elif "theverge.com" in url: |
... | ... |
@@ -1,53 +0,0 @@ |
1 |
-from userio import * |
|
2 |
-import requests |
|
3 |
-import re |
|
4 |
-import newsParser |
|
5 |
- |
|
6 |
def article(url):
    """Fetch the page at *url* and return a simplified HTML rendering of it.

    The page is downloaded with a desktop-browser User-Agent, the article
    body is cut out between known marker strings, and the result is a
    string made of Open Graph ``<meta>`` tags followed by one ``<article>``
    element that holds the title, description, lead image and cleaned body.

    Title, description and image URL come from ``newsParser.articleTitle``,
    ``newsParser.articleDescription`` and ``newsParser.articleImage``.
    Exceptions raised by ``requests.get`` are not caught here.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True, headers={'Accept-Encoding': 'deflate', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'})
    content = r.text

    indexBegin, indexEnd = _article_bounds(content)

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    # Open Graph header describing the article.
    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Visible part: our own heading block followed by the extracted body.
    # The heading is added *before* clean-up, so it is demoted along with
    # the page's own headings (<h2> becomes <h3>), as in the original code.
    article_only = ""
    article_only += "<h2>"+articleStrTitle+"</h2>\n"
    article_only += "<em>"+articleStrDescription+"</em>\n"
    article_only += "<img src=\""+articleStrImageUrl+"\">\n"
    article_only += content[indexBegin:indexEnd]
    article_only = _clean_article_html(article_only)

    pageContent += "<article>"+article_only+"</article>"
    return pageContent


def _article_bounds(content):
    """Return the ``(begin, end)`` slice indices of the article body in *content*."""
    # str.find returns -1 instead of raising, so no exception handling is
    # needed (the original used bare ``except:`` around str.index).
    begin = content.find("<div class=\"article-content\">")
    if begin == -1:
        begin = content.find("<time class=\"article-timestamp\"")
    if begin == -1:
        begin = 0

    end = content.find("<div class=\"bottom-left-rail-touts-spacer\">")
    if end == -1:
        # NOTE(review): with end == 0 the slice content[begin:0] is empty, so
        # no body text is emitted when the end marker is missing. Behaviour
        # kept from the original -- confirm it is intended.
        end = 0
    return begin, end


def _clean_article_html(article_only):
    """Strip ad placeholders, demote headings and normalise links/images."""
    article_only = re.sub(r"<div class=\"ac-w-ph__dsc\">Advertisement</div>", '', article_only)
    # Order matters: demote <h2> first so the <h1> -> <h2> results are not
    # demoted a second time.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    article_only = re.sub(r"<p>Advertisement</p>", '', article_only)
    # Collapse a <picture> element into one <img> using the first srcSet URL
    # without its query string. The replacement is a raw string so that
    # "\g<2>" is not an invalid escape sequence in the literal.
    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">', article_only)
    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)
    # Put every tag on its own line.
    article_only = article_only.replace("><", ">\n<")

    # NOTE(review): root-relative links are rewritten to www.nytimes.com;
    # confirm this host is the one intended for the site this parser handles.
    article_only = re.sub(r"href=\"\/", 'href=\"//www.nytimes.com/', article_only)
    return article_only