| ... | ... |
@@ -43,6 +43,8 @@ from .newsParser import newsFrandroidCom |
| 43 | 43 |
from .newsParser import newsBuzzfeedCom |
| 44 | 44 |
from .newsParser import newsYahooCom |
| 45 | 45 |
from .newsParser import newsBFM |
| 46 |
+from .newsParser import newsDefault |
|
| 47 |
+from .newsParser import newsLNC |
|
| 46 | 48 |
# ~ from .newsParser import newsTodayOnlineSG |
| 47 | 49 |
|
| 48 | 50 |
def supportedList(): |
| ... | ... |
@@ -106,14 +108,14 @@ def articleElement(typeElement,content): |
| 106 | 108 |
except: |
| 107 | 109 |
indexElementBegin = 0 |
| 108 | 110 |
try: |
| 109 |
- print("End Try: "+articleElementEnd)
|
|
| 111 |
+ #print("End Try: "+articleElementEnd)
|
|
| 110 | 112 |
indexElementEnd = content.index(articleElementEnd,indexElementBegin) |
| 111 | 113 |
except: |
| 112 | 114 |
try: |
| 113 |
- print("End Try: "+articleElementEnd2)
|
|
| 115 |
+ #print("End Try: "+articleElementEnd2)
|
|
| 114 | 116 |
indexElementEnd = content.index(articleElementEnd2,indexElementBegin) |
| 115 | 117 |
except: |
| 116 |
- print("End Try: "+articleElementEnd3)
|
|
| 118 |
+ #print("End Try: "+articleElementEnd3)
|
|
| 117 | 119 |
indexElementEnd = content.index(articleElementEnd3,indexElementBegin) |
| 118 | 120 |
element = content[indexElementBegin+len(articleElementBegin):indexElementEnd] |
| 119 | 121 |
#print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd))
|
| ... | ... |
@@ -205,9 +207,12 @@ def getArticle(url): |
| 205 | 207 |
data_page += newsParser.newsYahooCom.article(url) |
| 206 | 208 |
elif "bfmtv.com" in url: |
| 207 | 209 |
data_page += newsParser.newsBFM.article(url) |
| 210 |
+ elif "lnc.nc" in url: |
|
| 211 |
+ data_page += newsParser.newsLNC.article(url) |
|
| 208 | 212 |
else: |
| 209 |
- data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n" |
|
| 210 |
- data_page += "<p>Supported News:" |
|
| 211 |
- data_page += supportedList() |
|
| 212 |
- data_page += "</p>\n" |
|
| 213 |
+ data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n" |
|
| 214 |
+ #data_page += "<p>Supported News:" |
|
| 215 |
+ #data_page += supportedList() |
|
| 216 |
+ #data_page += "</p>\n" |
|
| 217 |
+ data_page += newsParser.newsDefault.article(url) |
|
| 213 | 218 |
return data_page |
| ... | ... |
@@ -0,0 +1,83 @@ |
| 1 |
+from userio import * |
|
| 2 |
+import requests |
|
| 3 |
+import re |
|
| 4 |
+ |
|
| 5 |
def articleImage(content):
    """Return the URL held by the page's ``og:image`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        next double quote, or ``"favicon.png"`` when the tag (or its closing
        quote) cannot be found.
    """
    articleImgBegin = "<meta property=\"og:image\" content=\""
    articleImgEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(articleImgBegin) + len(articleImgBegin)
        valueEnd = content.index(articleImgEnd, valueBegin)
    except ValueError:
        return "favicon.png"
    return content[valueBegin:valueEnd]
|
| 15 |
+ |
|
| 16 |
def articleDescription(content):
    """Return the text held by the page's ``og:description`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the next double quote, or ``"Description Extraction Failed"`` when
        the tag (or its closing quote) cannot be found.
    """
    descriptionBegin = "<meta property=\"og:description\" content=\""
    descriptionEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(descriptionBegin) + len(descriptionBegin)
        valueEnd = content.index(descriptionEnd, valueBegin)
    except ValueError:
        return "Description Extraction Failed"
    return content[valueBegin:valueEnd]
|
| 26 |
+ |
|
| 27 |
def articleTitle(content):
    """Return the text held by the page's ``og:title`` meta tag.

    The search marker deliberately omits the ``<meta property=`` prefix and
    matches only ``"og:title" content="``, so tags that name the attribute
    differently (for example ``name="og:title"``) are found as well.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between the marker and the next double quote, or
        ``"Title Extraction Failed"`` when the marker (or its closing quote)
        cannot be found.
    """
    titleBegin = "\"og:title\" content=\""
    titleEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(titleBegin) + len(titleBegin)
        valueEnd = content.index(titleEnd, valueBegin)
    except ValueError:
        return "Title Extraction Failed"
    return content[valueBegin:valueEnd]
|
| 38 |
+ |
|
| 39 |
def article(url):
    """Download *url* and build a simplified HTML fragment from it.

    The fragment starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image) built from values extracted from the page,
    followed by the page's main content wrapped in ``<article>``.

    The content is taken from the first ``<article`` to the first
    ``</article>``; if either marker is missing, ``<body`` / ``</body>`` are
    tried, and finally the start / end of the document.

    Args:
        url: Address of the page to fetch.

    Returns:
        The generated HTML fragment as a string.
    """
    say("ArticleDefault: "+url)
    response = requests.get(url, allow_redirects=True)
    content = response.text

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Start of the content: <article, else <body, else the top of the page.
    indexBegin = content.find("<article")
    if indexBegin == -1:
        indexBegin = content.find("<body")
    if indexBegin == -1:
        indexBegin = 0

    # End of the content: </article>, else </body>, else the end of the page.
    indexEnd = content.find("</article>")
    if indexEnd == -1:
        indexEnd = content.find("</body>")
    if indexEnd == -1:
        # Previously strlen(content), which is not a Python builtin.
        indexEnd = len(content)

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Demote headings one level; h2 must be handled before h1 so that the
    # freshly demoted h1 elements are not demoted a second time.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
| ... | ... |
@@ -0,0 +1,89 @@ |
| 1 |
+from userio import * |
|
| 2 |
+import requests |
|
| 3 |
+import re |
|
| 4 |
+ |
|
| 5 |
def articleImage(content):
    """Return the URL held by the page's ``og:image`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        next double quote, or ``"favicon.png"`` when the tag (or its closing
        quote) cannot be found.
    """
    articleImgBegin = "<meta property=\"og:image\" content=\""
    articleImgEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(articleImgBegin) + len(articleImgBegin)
        valueEnd = content.index(articleImgEnd, valueBegin)
    except ValueError:
        return "favicon.png"
    return content[valueBegin:valueEnd]
|
| 15 |
+ |
|
| 16 |
def articleDescription(content):
    """Return the text held by the page's ``og:description`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the next double quote, or ``"Description Extraction Failed"`` when
        the tag (or its closing quote) cannot be found.
    """
    descriptionBegin = "<meta property=\"og:description\" content=\""
    descriptionEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(descriptionBegin) + len(descriptionBegin)
        valueEnd = content.index(descriptionEnd, valueBegin)
    except ValueError:
        return "Description Extraction Failed"
    return content[valueBegin:valueEnd]
|
| 26 |
+ |
|
| 27 |
def articleTitle(content):
    """Return the text held by the page's ``og:title`` meta tag.

    The search marker deliberately omits the ``<meta property=`` prefix and
    matches only ``"og:title" content="``, so tags that name the attribute
    differently (for example ``name="og:title"``) are found as well.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between the marker and the next double quote, or
        ``"Title Extraction Failed"`` when the marker (or its closing quote)
        cannot be found.
    """
    titleBegin = "\"og:title\" content=\""
    titleEnd = "\""
    try:
        # str.index raises ValueError when the marker is absent, so the
        # lookups themselves must be guarded for the fallback to be reachable.
        valueBegin = content.index(titleBegin) + len(titleBegin)
        valueEnd = content.index(titleEnd, valueBegin)
    except ValueError:
        return "Title Extraction Failed"
    return content[valueBegin:valueEnd]
|
| 38 |
+ |
|
| 39 |
def article(url):
    """Download an lnc.nc page and build a simplified HTML fragment from it.

    The fragment starts with Open Graph ``<meta>`` tags (type, title,
    description, url, image) built from values extracted from the page,
    followed by the page's main content wrapped in ``<article>``.

    The content is taken from ``<div class="middle-main-content">`` up to
    ``<div id="IOSdialog"``; if either marker is missing, ``<body`` /
    ``</body>`` are tried, and finally the start / end of the document.

    Args:
        url: Address of the page to fetch.

    Returns:
        The generated HTML fragment as a string.
    """
    say("ArticleDefault: "+url)
    response = requests.get(url, allow_redirects=True)
    content = response.text

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    pageContent = ""
    pageContent += "<meta property=\"og:type\" content=\"article\">\n"
    pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
    pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
    pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
    pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
    pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

    # Start of the content: site-specific container, else <body, else the
    # top of the page.
    indexBegin = content.find("<div class=\"middle-main-content\">")
    if indexBegin == -1:
        indexBegin = content.find("<body")
    if indexBegin == -1:
        indexBegin = 0

    # End of the content: site-specific dialog div, else </body>, else the
    # end of the page.
    indexEnd = content.find("<div id=\"IOSdialog\"")
    if indexEnd == -1:
        indexEnd = content.find("</body>")
    if indexEnd == -1:
        # Previously strlen(content), which is not a Python builtin.
        indexEnd = len(content)

    article_only = content[indexBegin:indexEnd]
    article_only = re.sub(r"<amp-img", '<img', article_only)
    article_only = re.sub(r"</amp-img>", '', article_only)
    # Demote headings one level; h2 must be handled before h1 so that the
    # freshly demoted h1 elements are not demoted a second time.
    article_only = re.sub(r"<h2", '<h3', article_only)
    article_only = re.sub(r"</h2>", '</h3>', article_only)
    article_only = re.sub(r"<h1", '<h2', article_only)
    article_only = re.sub(r"</h1>", '</h2>', article_only)
    # Drop the link pointing at http://l.lnc.nc/changan.
    # NOTE(review): ".+" is greedy, so on a line holding several links this
    # removes everything up to the LAST "</a>" on that line - confirm that
    # is intended, otherwise use ".+?".
    article_only = re.sub(r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+)</a>", '', article_only)
    # Strip layout classes from the wrapping divs.
    article_only = re.sub(r"<div class=\"col-md col-md-8\">", '<div>', article_only)
    article_only = re.sub(r"<div class=\"tm_center_widget\">", '<div>', article_only)
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")
    pageContent += "<article>"+article_only+"</article>"
    return pageContent
| ... | ... |
@@ -3,15 +3,38 @@ import requests |
| 3 | 3 |
import re |
| 4 | 4 |
import newsParser |
| 5 | 5 |
|
| 6 |
def localArticleTitle(content):
    """Extract the ``og:title`` value from a page whose meta tags end in ``"/>``.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:title" content="`` and the
        first ``"/>`` found at or after the start of that tag.

    Raises:
        ValueError: if either marker is not present in *content*.
    """
    opening = "<meta property=\"og:title\" content=\""
    closing = "\"/>"
    tagAt = content.index(opening)
    valueAt = tagAt + len(opening)
    # The closing marker is searched from the start of the tag itself.
    return content[valueAt:content.index(closing, tagAt)]
|
| 12 |
+ |
|
| 13 |
def localArticleDescription(content):
    """Extract the ``og:description`` value from a page whose meta tags end in ``"/>``.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and
        the first ``"/>`` found at or after the start of that tag, or ``""``
        when either marker is missing.
    """
    articleElementBegin = "<meta property=\"og:description\" content=\""
    articleElementEnd = "\"/>"
    try:
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # The original trailing `return ""` was unreachable; this makes the
        # empty-string fallback take effect when the tag is absent.
        return ""
    return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
|
| 20 |
+ |
|
| 21 |
def localArticleImage(content):
    """Extract the ``og:image`` value from a page whose meta tags end in ``"/>``.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the
        first ``"/>`` found at or after the start of that tag, or ``""``
        when either marker is missing.
    """
    articleElementBegin = "<meta property=\"og:image\" content=\""
    articleElementEnd = "\"/>"
    try:
        indexElementBegin = content.index(articleElementBegin)
        indexElementEnd = content.index(articleElementEnd, indexElementBegin)
    except ValueError:
        # The original trailing `return ""` was unreachable; this makes the
        # empty-string fallback take effect when the tag is absent.
        return ""
    return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
|
| 28 |
+ |
|
| 6 | 29 |
def article(url): |
| 7 | 30 |
say("Article: "+url)
|
| 8 | 31 |
r = requests.get(url, allow_redirects=True) |
| 9 | 32 |
r.encoding = r.apparent_encoding |
| 10 | 33 |
content = r.text |
| 11 | 34 |
|
| 12 |
- articleStrImageUrl = newsParser.articleImage(content) |
|
| 13 |
- articleStrTitle = newsParser.articleTitle(content) |
|
| 14 |
- articleStrDescription = newsParser.articleDescription(content) |
|
| 35 |
+ articleStrImageUrl = localArticleImage(content) |
|
| 36 |
+ articleStrTitle = localArticleTitle(content) |
|
| 37 |
+ articleStrDescription = localArticleDescription(content) |
|
| 15 | 38 |
articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl) |
| 16 | 39 |
|
| 17 | 40 |
pageContent = "" |
| ... | ... |
@@ -26,16 +49,28 @@ def article(url): |
| 26 | 49 |
articleCstBegin = "<article" |
| 27 | 50 |
articleCstEnd = "<div class=\"mt-md\">" |
| 28 | 51 |
articleCstEnd2 = "</article>" |
| 52 |
+ articleCstEnd3 = "<div class=\"flex mt-md\">" |
|
| 29 | 53 |
indexBegin = content.index(articleCstBegin) |
| 30 | 54 |
try: |
| 31 | 55 |
indexEnd = content.index(articleCstEnd) |
| 32 | 56 |
except: |
| 33 |
- indexEnd = content.index(articleCstEnd2) |
|
| 57 |
+ try: |
|
| 58 |
+ indexEnd = content.index(articleCstEnd2) |
|
| 59 |
+ except: |
|
| 60 |
+ indexEnd = content.index(articleCstEnd3) |
|
| 61 |
+ debug("indexBegin: "+str(indexBegin))
|
|
| 62 |
+ debug("indexEnd : "+str(indexEnd))
|
|
| 63 |
+ say("Title: "+articleStrTitle)
|
|
| 64 |
+ say("Image: "+articleStrImageUrl)
|
|
| 34 | 65 |
|
| 35 | 66 |
|
| 36 | 67 |
article_only = "<h2>"+articleStrTitle+"</h2>" |
| 37 |
- article_only = "<img src=\""+articleStrImageUrl+"\">" |
|
| 68 |
+ article_only += "<img src=\""+articleStrImageUrl+"\">" |
|
| 69 |
+ article_only += "<em>"+articleStrDescription+"</em>" |
|
| 38 | 70 |
|
| 71 |
+ with open("titi.html", "w") as f2:
|
|
| 72 |
+ f2.write(content[indexBegin:indexEnd]) |
|
| 73 |
+ f2.close |
|
| 39 | 74 |
article_only += content[indexBegin:indexEnd] |
| 40 | 75 |
article_only = re.sub(r"<amp-img", '<img', article_only) |
| 41 | 76 |
article_only = re.sub(r"</amp-img>", '', article_only) |
| ... | ... |
@@ -45,11 +80,14 @@ def article(url): |
| 45 | 80 |
article_only = re.sub(r"</h1>", '</h2>', article_only) |
| 46 | 81 |
# ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only) |
| 47 | 82 |
article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only) |
| 48 |
- article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only) |
|
| 83 |
+ #article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only) |
|
| 84 |
+ article_only = re.sub(r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>","", article_only) |
|
| 85 |
+ |
|
| 49 | 86 |
article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only) |
| 50 | 87 |
article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only) |
| 51 | 88 |
article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only) |
| 52 | 89 |
article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only) |
| 90 |
+ article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only) |
|
| 53 | 91 |
article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only) |
| 54 | 92 |
article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only) |
| 55 | 93 |
article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only) |