| ... | ... |
@@ -35,7 +35,6 @@ from .newsParser import newsTheStarMy |
| 35 | 35 |
from .newsParser import newsNSTMy |
| 36 | 36 |
from .newsParser import newsLaDepeche |
| 37 | 37 |
from .newsParser import newsTheGuardian |
| 38 |
-from .newsParser import newsBloomberg |
|
| 39 | 38 |
from .newsParser import newsFranceTVInfo |
| 40 | 39 |
from .newsParser import newsTheVerge |
| 41 | 40 |
from .newsParser import newsBondyBlog |
| ... | ... |
@@ -64,9 +63,9 @@ def articleElement(typeElement,content): |
| 64 | 63 |
element="" |
| 65 | 64 |
if "\"mainEntityOfPage\": \"https://www.buzzfeed" in content: |
| 66 | 65 |
#print("=================== Buzzfeed")
|
| 67 |
- if typeElement is "title": |
|
| 66 |
+ if typeElement == "title": |
|
| 68 | 67 |
articleElementBegin ="\"headline\": \"" |
| 69 |
- elif typeElement is "description": |
|
| 68 |
+ elif typeElement == "description": |
|
| 70 | 69 |
articleElementBegin ="\"description\": \"" |
| 71 | 70 |
articleElementEnd ="\"," |
| 72 | 71 |
indexElementBegin = content.index(articleElementBegin) |
| ... | ... |
@@ -76,12 +75,12 @@ def articleElement(typeElement,content): |
| 76 | 75 |
#print("=================== Lemonde")
|
| 77 | 76 |
articleElementBegin="" |
| 78 | 77 |
articleElementEnd ="\">" |
| 79 |
- if typeElement is "image": |
|
| 78 |
+ if typeElement == "image": |
|
| 80 | 79 |
articleElementBegin ="<meta property=\"og:image\" content=\"http" |
| 81 | 80 |
indexElementBegin = content.index(articleElementBegin) |
| 82 | 81 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
| 83 | 82 |
element = "http"+content[indexElementBegin+len(articleElementBegin):indexElementEnd] |
| 84 |
- elif typeElement is "title": |
|
| 83 |
+ elif typeElement == "title": |
|
| 85 | 84 |
articleElementBegin ="<meta property=\"og:title\" content=\"" |
| 86 | 85 |
indexElementBegin = content.index(articleElementBegin) |
| 87 | 86 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
| ... | ... |
@@ -193,8 +192,6 @@ def getArticle(url): |
| 193 | 192 |
data_page += newsParser.newsLaDepeche.article(url) |
| 194 | 193 |
elif "guardian.com" in url or "guardian.co.uk" in url: |
| 195 | 194 |
data_page += newsParser.newsTheGuardian.article(url) |
| 196 |
- elif "bloomberg.com" in url: |
|
| 197 |
- data_page += newsParser.newsBloomberg.article(url) |
|
| 198 | 195 |
elif "francetvinfo.fr" in url: |
| 199 | 196 |
data_page += newsParser.newsFranceTVInfo.article(url) |
| 200 | 197 |
elif "theverge.com" in url: |
| ... | ... |
@@ -1,53 +0,0 @@ |
| 1 |
-from userio import * |
|
| 2 |
-import requests |
|
| 3 |
-import re |
|
| 4 |
-import newsParser |
|
| 5 |
- |
|
| 6 |
def article(url):
    """Download the page at *url* and return a simplified HTML rendering.

    The returned string consists of Open Graph ``<meta>`` tags (type, title,
    description, url, image, image type) followed by an ``<article>`` element
    that holds the title, the description, the lead image and the slice of
    the downloaded page found between the begin and end markers.

    Args:
        url: Address of the article to download.

    Returns:
        The assembled HTML fragment as a ``str``.
    """
    say("Article: "+url)

    r = requests.get(
        url,
        allow_redirects=True,
        headers={
            'Accept-Encoding': 'deflate',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        },
    )
    content = r.text

    articleCstBegin = "<div class=\"article-content\">"
    articleCstBegin2 = "<time class=\"article-timestamp\""
    articleCstEnd = "<div class=\"bottom-left-rail-touts-spacer\">"

    # Start at the first begin marker that is present; when neither is found
    # the extraction starts at the top of the page.
    indexBegin = 0
    for beginMarker in (articleCstBegin, articleCstBegin2):
        position = content.find(beginMarker)
        if position != -1:
            indexBegin = position
            break

    # Look for the end marker after the start position only.  When it is
    # missing, keep everything up to the end of the page: an end index of 0
    # would make the slice below empty and drop the whole article body.
    indexEnd = content.find(articleCstEnd, indexBegin)
    if indexEnd == -1:
        indexEnd = len(content)

    articleStrImageUrl = newsParser.articleImage(content)
    articleStrTitle = newsParser.articleTitle(content)
    articleStrDescription = newsParser.articleDescription(content)

    pageContent = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
        "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
    ])

    article_only = "".join([
        "<h2>"+articleStrTitle+"</h2>\n",
        "<em>"+articleStrDescription+"</em>\n",
        "<img src=\""+articleStrImageUrl+"\">\n",
        content[indexBegin:indexEnd],
    ])

    # Drop advertisement placeholders.
    article_only = article_only.replace("<div class=\"ac-w-ph__dsc\">Advertisement</div>", '')

    # Demote headings by one level.  h2 must be handled before h1, otherwise
    # the freshly demoted h1 headings would be demoted a second time.
    article_only = article_only.replace("<h2", '<h3')
    article_only = article_only.replace("</h2>", '</h3>')
    article_only = article_only.replace("<h1", '<h2')
    article_only = article_only.replace("</h1>", '</h2>')

    article_only = article_only.replace("<p>Advertisement</p>", '')

    # Collapse each <picture> element into a plain <img> that points at the
    # first srcSet URL with its query string removed.
    article_only = re.sub(r"<picture><source media=\"(.*?)\" srcSet=\"(.*?)\?(.*?)</picture>", r'<img src="\g<2>">', article_only)

    # Remove the visually hidden "skip" links.
    article_only = re.sub(r"<a href=\"#after-(.*?)\" style=\"position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;clip:rect\(0 0 0 0\);overflow:hidden\">Continue reading the main story</a>", '', article_only)

    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    # NOTE(review): links starting with "/" are re-rooted on www.nytimes.com;
    # confirm that this is the intended host for the pages handled here.  The
    # pattern also matches protocol-relative ("//host/...") links.
    article_only = re.sub(r"href=\"\/", 'href="//www.nytimes.com/', article_only)

    pageContent += "<article>"+article_only+"</article>"
    return pageContent