Added first version of LNC.NC ・ 9e78dac ・ Gitprep

- Added first version of LNC.NC;
- Browse files
- ycawidro committed on 2022-01-31
- 1 parent 4e479c0
  
  commit 9e78dac073c03cd4ef05873f62361994d61f6e6f

Showing 4 changed files with 228 additions and 13 deletions

+12 -7

newsParser/__init__.py

@@ -43,6 +43,8 @@ from .newsParser import newsFrandroidCom
 from .newsParser import newsBuzzfeedCom
 from .newsParser import newsYahooCom
 from .newsParser import newsBFM
+from .newsParser import newsDefault
+from .newsParser import newsLNC
 # ~ from .newsParser import newsTodayOnlineSG
 
 def supportedList():
@@ -106,14 +108,14 @@ def articleElement(typeElement,content):
       except:
         indexElementBegin = 0
     try:
-      print("End Try: "+articleElementEnd)
+      #print("End Try: "+articleElementEnd)
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
     except:
       try:
-        print("End Try: "+articleElementEnd2)
+        #print("End Try: "+articleElementEnd2)
         indexElementEnd   = content.index(articleElementEnd2,indexElementBegin)
       except:
-        print("End Try: "+articleElementEnd3)
+        #print("End Try: "+articleElementEnd3)
         indexElementEnd   = content.index(articleElementEnd3,indexElementBegin)
     element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
   #print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd))
@@ -205,9 +207,12 @@ def getArticle(url):
       data_page += newsParser.newsYahooCom.article(url)
     elif "bfmtv.com" in url:
       data_page += newsParser.newsBFM.article(url)
+    elif "lnc.nc" in url:
+      data_page += newsParser.newsLNC.article(url)
     else:
-       data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
-       data_page += "<p>Supported News:"
-       data_page += supportedList()
-       data_page += "</p>\n"
+       data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
+       #data_page += "<p>Supported News:"
+       #data_page += supportedList()
+       #data_page += "</p>\n"
+       data_page += newsParser.newsDefault.article(url)
   return data_page

+83

newsParser/newsParser/newsDefault.py

View

...	...	@@ -0,0 +1,83 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+
	5	+def articleImage(content):
	6	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	7	+ articleImgEnd ="\""
	8	+ indexImgBegin = content.index(articleImgBegin)
	9	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	10	+ try:
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ except:
	13	+ image = "favicon.png"
	14	+ return image
	15	+
	16	+def articleDescription(content):
	17	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	18	+ articleImgEnd ="\""
	19	+ indexImgBegin = content.index(articleImgBegin)
	20	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	21	+ try:
	22	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	23	+ except:
	24	+ title = "Description Extraction Failed"
	25	+ return title
	26	+
	27	+def articleTitle(content):
	28	+ #articleImgBegin ="<meta property=\"og:title\" content=\""
	29	+ articleImgBegin ="\"og:title\" content=\""
	30	+ articleImgEnd ="\""
	31	+ indexImgBegin = content.index(articleImgBegin)
	32	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	33	+ try:
	34	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	35	+ except:
	36	+ title = "Title Extraction Failed"
	37	+ return title
	38	+
	39	+def article(url):
	40	+ say("ArticleDefault: "+url)
	41	+ r = requests.get(url, allow_redirects=True)
	42	+ content = r.text
	43	+
	44	+ articleStrImageUrl = articleImage(content)
	45	+ articleStrTitle = articleTitle(content)
	46	+ articleStrDescription = articleDescription(content)
	47	+
	48	+ pageContent = ""
	49	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	50	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	51	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	52	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	53	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	54	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	55	+
	56	+ articleCstBegin = "<article"
	57	+ articleCstEnd = "</article>"
	58	+ articleCstBegin2 = "<body"
	59	+ articleCstEnd2 = "</body>"
	60	+ try:
	61	+ indexBegin = content.index(articleCstBegin)
	62	+ except:
	63	+ try:
	64	+ indexBegin = content.index(articleCstBegin2)
	65	+ except:
	66	+ indexBegin = 0
	67	+ try:
	68	+ indexEnd = content.index(articleCstEnd)
	69	+ except:
	70	+ try:
	71	+ indexEnd = content.index(articleCstEnd2)
	72	+ except:
	73	+ indexEnd = strlen(content)
	74	+ article_only = content[indexBegin:indexEnd]
	75	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	76	+ article_only = re.sub(r"</amp-img>", '', article_only)
	77	+ article_only = re.sub(r"<h2", '<h3', article_only)
	78	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	79	+ article_only = re.sub(r"<h1", '<h2', article_only)
	80	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	81	+ article_only = article_only.replace("><", ">\n<")
	82	+ pageContent += "<article>"+article_only+"</article>"
	83	+ return pageContent

+89

newsParser/newsParser/newsLNC.py

View

...	...	@@ -0,0 +1,89 @@
	1	+from userio import *
	2	+import requests
	3	+import re
	4	+
	5	+def articleImage(content):
	6	+ articleImgBegin ="<meta property=\"og:image\" content=\""
	7	+ articleImgEnd ="\""
	8	+ indexImgBegin = content.index(articleImgBegin)
	9	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	10	+ try:
	11	+ image = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	12	+ except:
	13	+ image = "favicon.png"
	14	+ return image
	15	+
	16	+def articleDescription(content):
	17	+ articleImgBegin ="<meta property=\"og:description\" content=\""
	18	+ articleImgEnd ="\""
	19	+ indexImgBegin = content.index(articleImgBegin)
	20	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	21	+ try:
	22	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	23	+ except:
	24	+ title = "Description Extraction Failed"
	25	+ return title
	26	+
	27	+def articleTitle(content):
	28	+ #articleImgBegin ="<meta property=\"og:title\" content=\""
	29	+ articleImgBegin ="\"og:title\" content=\""
	30	+ articleImgEnd ="\""
	31	+ indexImgBegin = content.index(articleImgBegin)
	32	+ indexImgEnd = content.index(articleImgEnd,indexImgBegin+len(articleImgBegin))
	33	+ try:
	34	+ title = content[indexImgBegin+len(articleImgBegin):indexImgEnd]
	35	+ except:
	36	+ title = "Title Extraction Failed"
	37	+ return title
	38	+
	39	+def article(url):
	40	+ say("ArticleDefault: "+url)
	41	+ r = requests.get(url, allow_redirects=True)
	42	+ content = r.text
	43	+
	44	+ articleStrImageUrl = articleImage(content)
	45	+ articleStrTitle = articleTitle(content)
	46	+ articleStrDescription = articleDescription(content)
	47	+
	48	+ pageContent = ""
	49	+ pageContent += "<meta property=\"og:type\" content=\"article\">\n"
	50	+ pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
	51	+ pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
	52	+ pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
	53	+ pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
	54	+ pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"
	55	+
	56	+ #articleCstBegin = "<article"
	57	+ #articleCstEnd = "</article>"
	58	+ articleCstBegin = "<div class=\"middle-main-content\">"
	59	+ articleCstEnd = "<div id=\"IOSdialog\""
	60	+ articleCstBegin2 = "<body"
	61	+ articleCstEnd2 = "</body>"
	62	+ try:
	63	+ indexBegin = content.index(articleCstBegin)
	64	+ except:
	65	+ try:
	66	+ indexBegin = content.index(articleCstBegin2)
	67	+ except:
	68	+ indexBegin = 0
	69	+ try:
	70	+ indexEnd = content.index(articleCstEnd)
	71	+ except:
	72	+ try:
	73	+ indexEnd = content.index(articleCstEnd2)
	74	+ except:
	75	+ indexEnd = strlen(content)
	76	+ #<a href="http://l.lnc.nc/changan" target="_blank">
	77	+ article_only = content[indexBegin:indexEnd]
	78	+ article_only = re.sub(r"<amp-img", '<img', article_only)
	79	+ article_only = re.sub(r"</amp-img>", '', article_only)
	80	+ article_only = re.sub(r"<h2", '<h3', article_only)
	81	+ article_only = re.sub(r"</h2>", '</h3>', article_only)
	82	+ article_only = re.sub(r"<h1", '<h2', article_only)
	83	+ article_only = re.sub(r"</h1>", '</h2>', article_only)
	84	+ article_only = re.sub(r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+)</a>", '', article_only)
	85	+ article_only = re.sub(r"<div class=\"col-md col-md-8\">", '<div>', article_only)
	86	+ article_only = re.sub(r"<div class=\"tm_center_widget\">", '<div>', article_only)
	87	+ article_only = article_only.replace("><", ">\n<")
	88	+ pageContent += "<article>"+article_only+"</article>"
	89	+ return pageContent

+44 -6

newsParser/newsParser/newsWaPo.py

View

@@ -3,15 +3,38 @@ import requests
 import re
 import newsParser
 
+def localArticleTitle(content):
+  articleElementBegin="<meta property=\"og:title\" content=\""
+  articleElementEnd  ="\"/>"
+  indexElementBegin  = content.index(articleElementBegin)
+  indexElementEnd    = content.index(articleElementEnd,indexElementBegin)
+  return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
+
+def localArticleDescription(content):
+  articleElementBegin="<meta property=\"og:description\" content=\""
+  articleElementEnd  ="\"/>"
+  indexElementBegin  = content.index(articleElementBegin)
+  indexElementEnd    = content.index(articleElementEnd,indexElementBegin)
+  return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
+  return ""
+
+def localArticleImage(content):
+  articleElementBegin="<meta property=\"og:image\" content=\""
+  articleElementEnd  ="\"/>"
+  indexElementBegin  = content.index(articleElementBegin)
+  indexElementEnd    = content.index(articleElementEnd,indexElementBegin)
+  return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
+  return ""
+
 def article(url):
   say("Article: "+url)
   r = requests.get(url, allow_redirects=True)
   r.encoding = r.apparent_encoding
   content = r.text
   
-  articleStrImageUrl = newsParser.articleImage(content)
-  articleStrTitle = newsParser.articleTitle(content)
-  articleStrDescription = newsParser.articleDescription(content)
+  articleStrImageUrl = localArticleImage(content)
+  articleStrTitle = localArticleTitle(content)
+  articleStrDescription = localArticleDescription(content)
   articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)
   
   pageContent = ""
@@ -26,16 +49,28 @@ def article(url):
   articleCstBegin = "<article"
   articleCstEnd   = "<div class=\"mt-md\">"
   articleCstEnd2   = "</article>"
+  articleCstEnd3   = "<div class=\"flex mt-md\">"
   indexBegin = content.index(articleCstBegin)
   try:
     indexEnd  = content.index(articleCstEnd)
   except:
-    indexEnd  = content.index(articleCstEnd2) 
+    try:
+      indexEnd  = content.index(articleCstEnd2) 
+    except:
+      indexEnd = content.index(articleCstEnd3)
+  debug("indexBegin: "+str(indexBegin))
+  debug("indexEnd  : "+str(indexEnd))
+  say("Title: "+articleStrTitle)
+  say("Image: "+articleStrImageUrl)
 
 
   article_only = "<h2>"+articleStrTitle+"</h2>"
-  article_only = "<img src=\""+articleStrImageUrl+"\">"
+  article_only += "<img src=\""+articleStrImageUrl+"\">"
+  article_only += "<em>"+articleStrDescription+"</em>"
 
+  with open("titi.html", "w") as f2:
+      f2.write(content[indexBegin:indexEnd])
+      f2.close
   article_only += content[indexBegin:indexEnd]
   article_only = re.sub(r"<amp-img", '<img', article_only)
   article_only = re.sub(r"</amp-img>", '', article_only)
@@ -45,11 +80,14 @@ def article(url):
   article_only = re.sub(r"</h1>", '</h2>', article_only)
   # ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only)
   article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only)
-  article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
+  #article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
+  article_only = re.sub(r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>","", article_only)
+
   article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only)
   article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only)
   article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only)
   article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only)
+  article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only)
   article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only)
   article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only)
   article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)

...	...	@@ -43,6 +43,8 @@ from .newsParser import newsFrandroidCom
43	43	from .newsParser import newsBuzzfeedCom
44	44	from .newsParser import newsYahooCom
45	45	from .newsParser import newsBFM
	46	+from .newsParser import newsDefault
	47	+from .newsParser import newsLNC
46	48	# ~ from .newsParser import newsTodayOnlineSG
47	49
48	50	def supportedList():
...	...	@@ -106,14 +108,14 @@ def articleElement(typeElement,content):
106	108	except:
107	109	indexElementBegin = 0
108	110	try:
109		- print("End Try: "+articleElementEnd)
	111	+ #print("End Try: "+articleElementEnd)
110	112	indexElementEnd = content.index(articleElementEnd,indexElementBegin)
111	113	except:
112	114	try:
113		- print("End Try: "+articleElementEnd2)
	115	+ #print("End Try: "+articleElementEnd2)
114	116	indexElementEnd = content.index(articleElementEnd2,indexElementBegin)
115	117	except:
116		- print("End Try: "+articleElementEnd3)
	118	+ #print("End Try: "+articleElementEnd3)
117	119	indexElementEnd = content.index(articleElementEnd3,indexElementBegin)
118	120	element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
119	121	#print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd))
...	...	@@ -205,9 +207,12 @@ def getArticle(url):
205	207	data_page += newsParser.newsYahooCom.article(url)
206	208	elif "bfmtv.com" in url:
207	209	data_page += newsParser.newsBFM.article(url)
	210	+ elif "lnc.nc" in url:
	211	+ data_page += newsParser.newsLNC.article(url)
208	212	else:
209		- data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
210		- data_page += "<p>Supported News:"
211		- data_page += supportedList()
212		- data_page += "</p>\n"
	213	+ data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
	214	+ #data_page += "<p>Supported News:"
	215	+ #data_page += supportedList()
	216	+ #data_page += "</p>\n"
	217	+ data_page += newsParser.newsDefault.article(url)
213	218	return data_page

...	...	@@ -3,15 +3,38 @@ import requests
3	3	import re
4	4	import newsParser
5	5
	6	+def localArticleTitle(content):
	7	+ articleElementBegin="<meta property=\"og:title\" content=\""
	8	+ articleElementEnd ="\"/>"
	9	+ indexElementBegin = content.index(articleElementBegin)
	10	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	11	+ return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	12	+
	13	+def localArticleDescription(content):
	14	+ articleElementBegin="<meta property=\"og:description\" content=\""
	15	+ articleElementEnd ="\"/>"
	16	+ indexElementBegin = content.index(articleElementBegin)
	17	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	18	+ return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	19	+ return ""
	20	+
	21	+def localArticleImage(content):
	22	+ articleElementBegin="<meta property=\"og:image\" content=\""
	23	+ articleElementEnd ="\"/>"
	24	+ indexElementBegin = content.index(articleElementBegin)
	25	+ indexElementEnd = content.index(articleElementEnd,indexElementBegin)
	26	+ return content[indexElementBegin+len(articleElementBegin):indexElementEnd]
	27	+ return ""
	28	+
6	29	def article(url):
7	30	say("Article: "+url)
8	31	r = requests.get(url, allow_redirects=True)
9	32	r.encoding = r.apparent_encoding
10	33	content = r.text
11	34
12		- articleStrImageUrl = newsParser.articleImage(content)
13		- articleStrTitle = newsParser.articleTitle(content)
14		- articleStrDescription = newsParser.articleDescription(content)
	35	+ articleStrImageUrl = localArticleImage(content)
	36	+ articleStrTitle = localArticleTitle(content)
	37	+ articleStrDescription = localArticleDescription(content)
15	38	articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)
16	39
17	40	pageContent = ""
...	...	@@ -26,16 +49,28 @@ def article(url):
26	49	articleCstBegin = "<article"
27	50	articleCstEnd = "<div class=\"mt-md\">"
28	51	articleCstEnd2 = "</article>"
	52	+ articleCstEnd3 = "<div class=\"flex mt-md\">"
29	53	indexBegin = content.index(articleCstBegin)
30	54	try:
31	55	indexEnd = content.index(articleCstEnd)
32	56	except:
33		- indexEnd = content.index(articleCstEnd2)
	57	+ try:
	58	+ indexEnd = content.index(articleCstEnd2)
	59	+ except:
	60	+ indexEnd = content.index(articleCstEnd3)
	61	+ debug("indexBegin: "+str(indexBegin))
	62	+ debug("indexEnd : "+str(indexEnd))
	63	+ say("Title: "+articleStrTitle)
	64	+ say("Image: "+articleStrImageUrl)
34	65
35	66
36	67	article_only = "<h2>"+articleStrTitle+"</h2>"
37		- article_only = "<img src=\""+articleStrImageUrl+"\">"
	68	+ article_only += "<img src=\""+articleStrImageUrl+"\">"
	69	+ article_only += "<em>"+articleStrDescription+"</em>"
38	70
	71	+ with open("titi.html", "w") as f2:
	72	+ f2.write(content[indexBegin:indexEnd])
	73	+ f2.close
39	74	article_only += content[indexBegin:indexEnd]
40	75	article_only = re.sub(r"<amp-img", '<img', article_only)
41	76	article_only = re.sub(r"</amp-img>", '', article_only)
...	...	@@ -45,11 +80,14 @@ def article(url):
45	80	article_only = re.sub(r"</h1>", '</h2>', article_only)
46	81	# ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only)
47	82	article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only)
48		- article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
	83	+ #article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
	84	+ article_only = re.sub(r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>","", article_only)
	85	+
49	86	article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only)
50	87	article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only)
51	88	article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only)
52	89	article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only)
	90	+ article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only)
53	91	article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only)
54	92	article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only)
55	93	article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)