... | ... |
@@ -43,6 +43,8 @@ from .newsParser import newsFrandroidCom |
43 | 43 |
from .newsParser import newsBuzzfeedCom |
44 | 44 |
from .newsParser import newsYahooCom |
45 | 45 |
from .newsParser import newsBFM |
46 |
+from .newsParser import newsDefault |
|
47 |
+from .newsParser import newsLNC |
|
46 | 48 |
# ~ from .newsParser import newsTodayOnlineSG |
47 | 49 |
|
48 | 50 |
def supportedList(): |
... | ... |
@@ -106,14 +108,14 @@ def articleElement(typeElement,content): |
106 | 108 |
except: |
107 | 109 |
indexElementBegin = 0 |
108 | 110 |
try: |
109 |
- print("End Try: "+articleElementEnd) |
|
111 |
+ #print("End Try: "+articleElementEnd) |
|
110 | 112 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
111 | 113 |
except: |
112 | 114 |
try: |
113 |
- print("End Try: "+articleElementEnd2) |
|
115 |
+ #print("End Try: "+articleElementEnd2) |
|
114 | 116 |
indexElementEnd = content.index(articleElementEnd2,indexElementBegin) |
115 | 117 |
except: |
116 |
- print("End Try: "+articleElementEnd3) |
|
118 |
+ #print("End Try: "+articleElementEnd3) |
|
117 | 119 |
indexElementEnd = content.index(articleElementEnd3,indexElementBegin) |
118 | 120 |
element = content[indexElementBegin+len(articleElementBegin):indexElementEnd] |
119 | 121 |
#print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd)) |
... | ... |
@@ -205,9 +207,12 @@ def getArticle(url): |
205 | 207 |
data_page += newsParser.newsYahooCom.article(url) |
206 | 208 |
elif "bfmtv.com" in url: |
207 | 209 |
data_page += newsParser.newsBFM.article(url) |
210 |
+ elif "lnc.nc" in url: |
|
211 |
+ data_page += newsParser.newsLNC.article(url) |
|
208 | 212 |
else: |
209 |
- data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n" |
|
210 |
- data_page += "<p>Supported News:" |
|
211 |
- data_page += supportedList() |
|
212 |
- data_page += "</p>\n" |
|
213 |
+ data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n" |
|
214 |
+ #data_page += "<p>Supported News:" |
|
215 |
+ #data_page += supportedList() |
|
216 |
+ #data_page += "</p>\n" |
|
217 |
+ data_page += newsParser.newsDefault.article(url) |
|
213 | 218 |
return data_page |
... | ... |
@@ -0,0 +1,83 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+ |
|
5 |
def articleImage(content):
    """Return the image URL declared in the page's ``og:image`` meta tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        next double quote, or ``"favicon.png"`` when the tag or its closing
        quote cannot be found.
    """
    articleImgBegin = "<meta property=\"og:image\" content=\""
    articleImgEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleImgBegin) + len(articleImgBegin)
        valueEnd = content.index(articleImgEnd, valueStart)
    except ValueError:
        return "favicon.png"
    return content[valueStart:valueEnd]
|
15 |
+ |
|
16 |
def articleDescription(content):
    """Return the text declared in the page's ``og:description`` meta tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the next double quote, or ``"Description Extraction Failed"`` when
        the tag or its closing quote cannot be found.
    """
    articleDescBegin = "<meta property=\"og:description\" content=\""
    articleDescEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleDescBegin) + len(articleDescBegin)
        valueEnd = content.index(articleDescEnd, valueStart)
    except ValueError:
        return "Description Extraction Failed"
    return content[valueStart:valueEnd]
|
26 |
+ |
|
27 |
def articleTitle(content):
    """Return the title declared in the page's ``og:title`` meta tag.

    Only the ``"og:title" content="`` part of the tag is matched, so the
    lookup does not depend on what precedes the attribute value inside
    the tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``"og:title" content="`` and the next double
        quote, or ``"Title Extraction Failed"`` when the marker or its
        closing quote cannot be found.
    """
    articleTitleBegin = "\"og:title\" content=\""
    articleTitleEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleTitleBegin) + len(articleTitleBegin)
        valueEnd = content.index(articleTitleEnd, valueStart)
    except ValueError:
        return "Title Extraction Failed"
    return content[valueStart:valueEnd]
|
38 |
+ |
|
39 |
def article(url):
    """Download ``url`` and return a simplified HTML fragment for it.

    The page is fetched with ``requests``. Its Open Graph image, title and
    description are extracted with this module's ``articleImage``,
    ``articleTitle`` and ``articleDescription`` helpers and re-emitted as
    ``<meta>`` tags, followed by the article markup wrapped in
    ``<article>...</article>``.

    The article markup is the slice of the page starting at the first
    ``<article`` (else the first ``<body``, else the start of the page) and
    ending at the next ``</article>`` (else the next ``</body>``, else the
    end of the page).

    Args:
        url: Address of the page to fetch.

    Returns:
        The generated HTML fragment as a string.
    """
    say("ArticleDefault: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Start markers, most specific first; fall back to the whole page.
    indexBegin = 0
    for beginMarker in ("<article", "<body"):
        found = content.find(beginMarker)
        if found != -1:
            indexBegin = found
            break

    # End markers are searched from indexBegin so that a closing tag that
    # appears earlier in the page cannot yield an empty slice. The final
    # fallback is the end of the page (len(), not the non-existent strlen()).
    indexEnd = len(content)
    for endMarker in ("</article>", "</body>"):
        found = content.find(endMarker, indexBegin)
        if found != -1:
            indexEnd = found
            break

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Demote headings by one level; h2 must be rewritten before h1 so the
    # freshly produced <h2> tags are not demoted a second time.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -0,0 +1,89 @@ |
1 |
+from userio import * |
|
2 |
+import requests |
|
3 |
+import re |
|
4 |
+ |
|
5 |
def articleImage(content):
    """Return the image URL declared in the page's ``og:image`` meta tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        next double quote, or ``"favicon.png"`` when the tag or its closing
        quote cannot be found.
    """
    articleImgBegin = "<meta property=\"og:image\" content=\""
    articleImgEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleImgBegin) + len(articleImgBegin)
        valueEnd = content.index(articleImgEnd, valueStart)
    except ValueError:
        return "favicon.png"
    return content[valueStart:valueEnd]
|
15 |
+ |
|
16 |
def articleDescription(content):
    """Return the text declared in the page's ``og:description`` meta tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the next double quote, or ``"Description Extraction Failed"`` when
        the tag or its closing quote cannot be found.
    """
    articleDescBegin = "<meta property=\"og:description\" content=\""
    articleDescEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleDescBegin) + len(articleDescBegin)
        valueEnd = content.index(articleDescEnd, valueStart)
    except ValueError:
        return "Description Extraction Failed"
    return content[valueStart:valueEnd]
|
26 |
+ |
|
27 |
def articleTitle(content):
    """Return the title declared in the page's ``og:title`` meta tag.

    Only the ``"og:title" content="`` part of the tag is matched, so the
    lookup does not depend on what precedes the attribute value inside
    the tag.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``"og:title" content="`` and the next double
        quote, or ``"Title Extraction Failed"`` when the marker or its
        closing quote cannot be found.
    """
    articleTitleBegin = "\"og:title\" content=\""
    articleTitleEnd = "\""
    try:
        # str.index raises ValueError on a miss, so both lookups have to be
        # inside the try block for the fallback below to be reachable.
        valueStart = content.index(articleTitleBegin) + len(articleTitleBegin)
        valueEnd = content.index(articleTitleEnd, valueStart)
    except ValueError:
        return "Title Extraction Failed"
    return content[valueStart:valueEnd]
|
38 |
+ |
|
39 |
def article(url):
    """Download an lnc.nc page and return a simplified HTML fragment for it.

    The page is fetched with ``requests``. Its Open Graph image, title and
    description are extracted with this module's ``articleImage``,
    ``articleTitle`` and ``articleDescription`` helpers and re-emitted as
    ``<meta>`` tags, followed by the article markup wrapped in
    ``<article>...</article>``.

    The article markup is the slice of the page starting at
    ``<div class="middle-main-content">`` (else the first ``<body``, else
    the start of the page) and ending at the next ``<div id="IOSdialog"``
    (else the next ``</body>``, else the end of the page).

    Args:
        url: Address of the page to fetch.

    Returns:
        The generated HTML fragment as a string.
    """
    say("ArticleDefault: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Start markers, most specific first; fall back to the whole page.
    indexBegin = 0
    for beginMarker in ("<div class=\"middle-main-content\">", "<body"):
        found = content.find(beginMarker)
        if found != -1:
            indexBegin = found
            break

    # End markers are searched from indexBegin so that a marker that
    # appears earlier in the page cannot yield an empty slice. The final
    # fallback is the end of the page (len(), not the non-existent strlen()).
    indexEnd = len(content)
    for endMarker in ("<div id=\"IOSdialog\"", "</body>"):
        found = content.find(endMarker, indexBegin)
        if found != -1:
            indexEnd = found
            break

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Demote headings by one level; h2 must be rewritten before h1 so the
    # freshly produced <h2> tags are not demoted a second time.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Drop the promotional link. The match is non-greedy so it stops at the
    # link's own </a> instead of swallowing everything up to the last </a>
    # on the same line.
    article_only = re.sub(r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+?)</a>", '', article_only)
    # Strip layout classes from the wrapper divs.
    article_only = re.sub(r"<div class=\"col-md col-md-8\">", '<div>', article_only)
    article_only = re.sub(r"<div class=\"tm_center_widget\">", '<div>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
... | ... |
@@ -3,15 +3,38 @@ import requests |
3 | 3 |
import re |
4 | 4 |
import newsParser |
5 | 5 |
|
6 |
def localArticleTitle(content):
    """Extract the ``og:title`` value from the page HTML.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:title" content="`` and the
        next ``"/>``.

    Raises:
        ValueError: If the opening tag or the closing ``"/>`` is not found.
    """
    opening = "<meta property=\"og:title\" content=\""
    closing = "\"/>"
    tagStart = content.index(opening)
    # The closing marker is searched from the start of the tag.
    valueEnd = content.index(closing, tagStart)
    valueStart = tagStart + len(opening)
    return content[valueStart:valueEnd]
|
12 |
+ |
|
13 |
def localArticleDescription(content):
    """Extract the ``og:description`` value from the page HTML.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the next ``"/>``, or an empty string when the opening tag or the
        closing ``"/>`` is not found.
    """
    articleElementBegin = "<meta property=\"og:description\" content=\""
    articleElementEnd = "\"/>"
    try:
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # Missing tag: fall back to an empty description instead of
        # letting ValueError propagate to the caller.
        return ""
    return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
|
20 |
+ |
|
21 |
def localArticleImage(content):
    """Extract the ``og:image`` URL from the page HTML.

    Args:
        content: Raw HTML of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        next ``"/>``, or an empty string when the opening tag or the
        closing ``"/>`` is not found.
    """
    articleElementBegin = "<meta property=\"og:image\" content=\""
    articleElementEnd = "\"/>"
    try:
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # Missing tag: fall back to an empty URL instead of letting
        # ValueError propagate to the caller.
        return ""
    return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
|
28 |
+ |
|
6 | 29 |
def article(url): |
7 | 30 |
say("Article: "+url) |
8 | 31 |
r = requests.get(url, allow_redirects=True) |
9 | 32 |
r.encoding = r.apparent_encoding |
10 | 33 |
content = r.text |
11 | 34 |
|
12 |
- articleStrImageUrl = newsParser.articleImage(content) |
|
13 |
- articleStrTitle = newsParser.articleTitle(content) |
|
14 |
- articleStrDescription = newsParser.articleDescription(content) |
|
35 |
+ articleStrImageUrl = localArticleImage(content) |
|
36 |
+ articleStrTitle = localArticleTitle(content) |
|
37 |
+ articleStrDescription = localArticleDescription(content) |
|
15 | 38 |
articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl) |
16 | 39 |
|
17 | 40 |
pageContent = "" |
... | ... |
@@ -26,16 +49,28 @@ def article(url): |
26 | 49 |
articleCstBegin = "<article" |
27 | 50 |
articleCstEnd = "<div class=\"mt-md\">" |
28 | 51 |
articleCstEnd2 = "</article>" |
52 |
+ articleCstEnd3 = "<div class=\"flex mt-md\">" |
|
29 | 53 |
indexBegin = content.index(articleCstBegin) |
30 | 54 |
try: |
31 | 55 |
indexEnd = content.index(articleCstEnd) |
32 | 56 |
except: |
33 |
- indexEnd = content.index(articleCstEnd2) |
|
57 |
+ try: |
|
58 |
+ indexEnd = content.index(articleCstEnd2) |
|
59 |
+ except: |
|
60 |
+ indexEnd = content.index(articleCstEnd3) |
|
61 |
+ debug("indexBegin: "+str(indexBegin)) |
|
62 |
+ debug("indexEnd : "+str(indexEnd)) |
|
63 |
+ say("Title: "+articleStrTitle) |
|
64 |
+ say("Image: "+articleStrImageUrl) |
|
34 | 65 |
|
35 | 66 |
|
36 | 67 |
article_only = "<h2>"+articleStrTitle+"</h2>" |
37 |
- article_only = "<img src=\""+articleStrImageUrl+"\">" |
|
68 |
+ article_only += "<img src=\""+articleStrImageUrl+"\">" |
|
69 |
+ article_only += "<em>"+articleStrDescription+"</em>" |
|
38 | 70 |
|
71 |
+ with open("titi.html", "w") as f2: |
|
72 |
+ f2.write(content[indexBegin:indexEnd]) |
|
73 |
+ f2.close |
|
39 | 74 |
article_only += content[indexBegin:indexEnd] |
40 | 75 |
article_only = re.sub(r"<amp-img", '<img', article_only) |
41 | 76 |
article_only = re.sub(r"</amp-img>", '', article_only) |
... | ... |
@@ -45,11 +80,14 @@ def article(url): |
45 | 80 |
article_only = re.sub(r"</h1>", '</h2>', article_only) |
46 | 81 |
# ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only) |
47 | 82 |
article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only) |
48 |
- article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only) |
|
83 |
+ #article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only) |
|
84 |
+ article_only = re.sub(r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>","", article_only) |
|
85 |
+ |
|
49 | 86 |
article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only) |
50 | 87 |
article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only) |
51 | 88 |
article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only) |
52 | 89 |
article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only) |
90 |
+ article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only) |
|
53 | 91 |
article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only) |
54 | 92 |
article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only) |
55 | 93 |
article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only) |