Showing 4 changed files with 228 additions and 13 deletions
+12 -7
newsParser/__init__.py
... ...
@@ -43,6 +43,8 @@ from .newsParser import newsFrandroidCom
43 43
 from .newsParser import newsBuzzfeedCom
44 44
 from .newsParser import newsYahooCom
45 45
 from .newsParser import newsBFM
46
+from .newsParser import newsDefault
47
+from .newsParser import newsLNC
46 48
 # ~ from .newsParser import newsTodayOnlineSG
47 49
 
48 50
 def supportedList():
... ...
@@ -106,14 +108,14 @@ def articleElement(typeElement,content):
106 108
       except:
107 109
         indexElementBegin = 0
108 110
     try:
109
-      print("End Try: "+articleElementEnd)
111
+      #print("End Try: "+articleElementEnd)
110 112
       indexElementEnd   = content.index(articleElementEnd,indexElementBegin)
111 113
     except:
112 114
       try:
113
-        print("End Try: "+articleElementEnd2)
115
+        #print("End Try: "+articleElementEnd2)
114 116
         indexElementEnd   = content.index(articleElementEnd2,indexElementBegin)
115 117
       except:
116
-        print("End Try: "+articleElementEnd3)
118
+        #print("End Try: "+articleElementEnd3)
117 119
         indexElementEnd   = content.index(articleElementEnd3,indexElementBegin)
118 120
     element = content[indexElementBegin+len(articleElementBegin):indexElementEnd]
119 121
   #print("indexes: "+str(indexElementBegin)+" :: "+str(indexElementEnd))
... ...
@@ -205,9 +207,12 @@ def getArticle(url):
205 207
       data_page += newsParser.newsYahooCom.article(url)
206 208
     elif "bfmtv.com" in url:
207 209
       data_page += newsParser.newsBFM.article(url)
210
+    elif "lnc.nc" in url:
211
+      data_page += newsParser.newsLNC.article(url)
208 212
     else:
209
-       data_page += "<p>Unsupported News, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
210
-       data_page += "<p>Supported News:"
211
-       data_page += supportedList()
212
-       data_page += "</p>\n"
213
+       data_page += "<p>Generic Extraction, click to open <a href=\""+url+"\" target=\"new\">original link</a></p>\n"
214
+       #data_page += "<p>Supported News:"
215
+       #data_page += supportedList()
216
+       #data_page += "</p>\n"
217
+       data_page += newsParser.newsDefault.article(url)
213 218
   return data_page
+83
newsParser/newsParser/newsDefault.py
... ...
@@ -0,0 +1,83 @@
1
+from userio import *
2
+import requests
3
+import re
4
+
5
def articleImage(content):
  """Return the URL stored in the page's ``og:image`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:image" content="`` and the next
    double quote, or ``"favicon.png"`` when the tag or its closing quote
    cannot be found.
  """
  marker = "<meta property=\"og:image\" content=\""
  try:
    # str.index raises ValueError on a miss, so both lookups have to live
    # inside the try for the fallback below to be reachable.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "favicon.png"
  return content[start:end]
15
+
16
def articleDescription(content):
  """Return the text stored in the page's ``og:description`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:description" content="`` and the
    next double quote, or ``"Description Extraction Failed"`` when the tag or
    its closing quote cannot be found.
  """
  marker = "<meta property=\"og:description\" content=\""
  try:
    # Both lookups can raise ValueError; keep them inside the try so the
    # fallback string is actually returned on a miss.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "Description Extraction Failed"
  return content[start:end]
26
+
27
def articleTitle(content):
  """Return the text stored in the page's ``og:title`` meta tag.

  The marker deliberately omits the ``<meta property=`` prefix, so any tag
  containing ``"og:title" content="`` is accepted.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``"og:title" content="`` and the next double quote, or
    ``"Title Extraction Failed"`` when the marker or its closing quote cannot
    be found.
  """
  marker = "\"og:title\" content=\""
  try:
    # Both lookups can raise ValueError; keep them inside the try so the
    # fallback string is actually returned on a miss.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "Title Extraction Failed"
  return content[start:end]
38
+
39
def article(url):
  """Download *url* and build a generic HTML rendering of the page.

  The page is fetched with ``requests.get`` (redirects followed) and its
  ``og:image``, ``og:title`` and ``og:description`` values are read with
  articleImage, articleTitle and articleDescription. The returned fragment
  starts with a set of Open Graph meta tags and is followed by the page body
  wrapped in ``<article>...</article>``.

  The body is the slice starting at the first ``<article`` (else ``<body``,
  else the start of the document) and ending just before the next
  ``</article>`` (else ``</body>``, else the end of the document).

  Args:
    url: Address of the page to fetch.

  Returns:
    The generated HTML fragment as a string.
  """
  say("ArticleDefault: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  # Start of the extract: preferred marker, then <body, then the whole page.
  indexBegin = content.find("<article")
  if indexBegin < 0:
    indexBegin = content.find("<body")
  if indexBegin < 0:
    indexBegin = 0

  # End of the extract: only look after the start so a stray closing tag
  # earlier in the page cannot produce an empty slice.
  indexEnd = content.find("</article>", indexBegin)
  if indexEnd < 0:
    indexEnd = content.find("</body>", indexBegin)
  if indexEnd < 0:
    indexEnd = len(content)

  article_only = content[indexBegin:indexEnd]
  # Order matters: h2 must become h3 before h1 is demoted to h2.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)
  # Put adjacent tags on separate lines.
  article_only = article_only.replace("><", ">\n<")
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+89
newsParser/newsParser/newsLNC.py
... ...
@@ -0,0 +1,89 @@
1
+from userio import *
2
+import requests
3
+import re
4
+
5
def articleImage(content):
  """Return the URL stored in the page's ``og:image`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:image" content="`` and the next
    double quote, or ``"favicon.png"`` when the tag or its closing quote
    cannot be found.
  """
  marker = "<meta property=\"og:image\" content=\""
  try:
    # A missing marker makes str.index raise ValueError; the lookups must be
    # inside the try or the default image is never returned.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "favicon.png"
  return content[start:end]
15
+
16
def articleDescription(content):
  """Return the text stored in the page's ``og:description`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:description" content="`` and the
    next double quote, or ``"Description Extraction Failed"`` when the tag or
    its closing quote cannot be found.
  """
  marker = "<meta property=\"og:description\" content=\""
  try:
    # A missing marker makes str.index raise ValueError; the lookups must be
    # inside the try or the fallback text is never returned.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "Description Extraction Failed"
  return content[start:end]
26
+
27
def articleTitle(content):
  """Return the text stored in the page's ``og:title`` meta tag.

  The marker deliberately omits the ``<meta property=`` prefix, so any tag
  containing ``"og:title" content="`` is accepted.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``"og:title" content="`` and the next double quote, or
    ``"Title Extraction Failed"`` when the marker or its closing quote cannot
    be found.
  """
  marker = "\"og:title\" content=\""
  try:
    # A missing marker makes str.index raise ValueError; the lookups must be
    # inside the try or the fallback text is never returned.
    start = content.index(marker) + len(marker)
    end = content.index("\"", start)
  except ValueError:
    return "Title Extraction Failed"
  return content[start:end]
38
+
39
def article(url):
  """Download an lnc.nc page at *url* and build an HTML rendering of it.

  The page is fetched with ``requests.get`` (redirects followed) and its
  ``og:image``, ``og:title`` and ``og:description`` values are read with
  articleImage, articleTitle and articleDescription. The returned fragment
  starts with a set of Open Graph meta tags and is followed by the page body
  wrapped in ``<article>...</article>``.

  The body is the slice starting at ``<div class="middle-main-content">``
  (else ``<body``, else the start of the document) and ending just before
  the next ``<div id="IOSdialog"`` (else ``</body>``, else the end of the
  document).

  Args:
    url: Address of the page to fetch.

  Returns:
    The generated HTML fragment as a string.
  """
  # NOTE(review): the log prefix still reads "ArticleDefault" although this
  # is the LNC parser - looks like a copy/paste leftover; confirm before
  # renaming.
  say("ArticleDefault: "+url)
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  # Start of the extract: site-specific container, then <body, then the
  # whole page.
  indexBegin = content.find("<div class=\"middle-main-content\">")
  if indexBegin < 0:
    indexBegin = content.find("<body")
  if indexBegin < 0:
    indexBegin = 0

  # End of the extract: only look after the start so an earlier occurrence
  # of the end marker cannot produce an empty slice.
  indexEnd = content.find("<div id=\"IOSdialog\"", indexBegin)
  if indexEnd < 0:
    indexEnd = content.find("</body>", indexBegin)
  if indexEnd < 0:
    indexEnd = len(content)

  article_only = content[indexBegin:indexEnd]
  # Order matters: h2 must become h3 before h1 is demoted to h2.
  for old, new in (
    ("<amp-img", "<img"),
    ("</amp-img>", ""),
    ("<h2", "<h3"),
    ("</h2>", "</h3>"),
    ("<h1", "<h2"),
    ("</h1>", "</h2>"),
  ):
    article_only = article_only.replace(old, new)
  # Drop the promotional link (<a href="http://l.lnc.nc/changan" ...>).
  # The match is non-greedy so it stops at that link's own </a> instead of
  # swallowing everything up to the last </a> on the same line.
  article_only = re.sub(r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+?)</a>", '', article_only)
  # Neutralise layout classes on the wrapper divs.
  article_only = article_only.replace("<div class=\"col-md col-md-8\">", "<div>")
  article_only = article_only.replace("<div class=\"tm_center_widget\">", "<div>")
  # Put adjacent tags on separate lines.
  article_only = article_only.replace("><", ">\n<")
  pageContent += "<article>"+article_only+"</article>"
  return pageContent
+44 -6
newsParser/newsParser/newsWaPo.py
... ...
@@ -3,15 +3,38 @@ import requests
3 3
 import re
4 4
 import newsParser
5 5
 
6
def localArticleTitle(content):
  """Return the value of the page's ``og:title`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:title" content="`` and the
    attribute's closing double quote.

  Raises:
    ValueError: if the tag or its closing quote is not present.
  """
  marker = "<meta property=\"og:title\" content=\""
  start = content.index(marker) + len(marker)
  # Stop at the attribute's own closing quote. Requiring the literal `"/>`
  # would run on into later tags whenever this one ends with `">` or `" />`.
  end = content.index("\"", start)
  return content[start:end]
12
+
13
def localArticleDescription(content):
  """Return the value of the page's ``og:description`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:description" content="`` and the
    attribute's closing double quote, or ``""`` when the tag or its closing
    quote is not present.
  """
  marker = "<meta property=\"og:description\" content=\""
  try:
    start = content.index(marker) + len(marker)
    # Stop at the attribute's own closing quote. Requiring the literal `"/>`
    # would run on into later tags whenever this one ends with `">` or `" />`.
    end = content.index("\"", start)
  except ValueError:
    # A description is optional: fall back to an empty string.
    return ""
  return content[start:end]
20
+
21
def localArticleImage(content):
  """Return the value of the page's ``og:image`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:image" content="`` and the
    attribute's closing double quote, or ``""`` when the tag or its closing
    quote is not present.
  """
  marker = "<meta property=\"og:image\" content=\""
  try:
    start = content.index(marker) + len(marker)
    # Stop at the attribute's own closing quote. Requiring the literal `"/>`
    # would run on into later tags whenever this one ends with `">` or `" />`.
    end = content.index("\"", start)
  except ValueError:
    # An image is optional: fall back to an empty string.
    return ""
  return content[start:end]
28
+
6 29
 def article(url):
7 30
   say("Article: "+url)
8 31
   r = requests.get(url, allow_redirects=True)
9 32
   r.encoding = r.apparent_encoding
10 33
   content = r.text
11 34
   
12
-  articleStrImageUrl = newsParser.articleImage(content)
13
-  articleStrTitle = newsParser.articleTitle(content)
14
-  articleStrDescription = newsParser.articleDescription(content)
35
+  articleStrImageUrl = localArticleImage(content)
36
+  articleStrTitle = localArticleTitle(content)
37
+  articleStrDescription = localArticleDescription(content)
15 38
   articleStrImageUrl = re.sub(r"https://www\.washingtonpost\.com/wp-apps/imrs\.php\?src=(.+)&.+", r"\g<1>", articleStrImageUrl)
16 39
   
17 40
   pageContent = ""
... ...
@@ -26,16 +49,28 @@ def article(url):
26 49
   articleCstBegin = "<article"
27 50
   articleCstEnd   = "<div class=\"mt-md\">"
28 51
   articleCstEnd2   = "</article>"
52
+  articleCstEnd3   = "<div class=\"flex mt-md\">"
29 53
   indexBegin = content.index(articleCstBegin)
30 54
   try:
31 55
     indexEnd  = content.index(articleCstEnd)
32 56
   except:
33
-    indexEnd  = content.index(articleCstEnd2) 
57
+    try:
58
+      indexEnd  = content.index(articleCstEnd2) 
59
+    except:
60
+      indexEnd = content.index(articleCstEnd3)
61
+  debug("indexBegin: "+str(indexBegin))
62
+  debug("indexEnd  : "+str(indexEnd))
63
+  say("Title: "+articleStrTitle)
64
+  say("Image: "+articleStrImageUrl)
34 65
 
35 66
 
36 67
   article_only = "<h2>"+articleStrTitle+"</h2>"
37
-  article_only = "<img src=\""+articleStrImageUrl+"\">"
68
+  article_only += "<img src=\""+articleStrImageUrl+"\">"
69
+  article_only += "<em>"+articleStrDescription+"</em>"
38 70
 
71
+  with open("titi.html", "w") as f2:
72
+      f2.write(content[indexBegin:indexEnd])
73
+      f2.close
39 74
   article_only += content[indexBegin:indexEnd]
40 75
   article_only = re.sub(r"<amp-img", '<img', article_only)
41 76
   article_only = re.sub(r"</amp-img>", '', article_only)
... ...
@@ -45,11 +80,14 @@ def article(url):
45 80
   article_only = re.sub(r"</h1>", '</h2>', article_only)
46 81
   # ~ article_only = re.sub(r"<div data-sc-v=\"4.23.4\" data-sc-c=\"placeholder\">Advertisement</div>", '</h2>', article_only)
47 82
   article_only = re.sub(r"<div data-sc-v=\"4\.24\.3\" data-sc-c=\"placeholder\">Advertisement</div>", '', article_only)
48
-  article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
83
+  #article_only = re.sub(r"<div class=\"dib bg-white pl-xs pr-xs font-sans-serif light font-xxxxs lh-md gray-dark\" data-sc-v=\"4\.24\.3\" data-sc-c=\"adslot\">Story continues below advertisement</div>", '', article_only)
84
+  article_only = re.sub(r"<div class=\"dib flex divider.+?data-sc-c=\"adslot\">Story continues below advertisement</div>","", article_only)
85
+
49 86
   article_only = re.sub(r"style=\"width:300px;height:250px\"", 'style=\"width:1px;height:1px\"', article_only)
50 87
   article_only = re.sub(r"style=\"width:120px;height:32px\"", 'style=\"width:1px;height:1px\"', article_only)
51 88
   article_only = re.sub(r"style=\"width:136px;height:20px\"", 'style=\"width:1px;height:1px\"', article_only)
52 89
   article_only = re.sub(r"style=\"width:300px;height:600px\"", 'style=\"width:1px;height:1px\"', article_only)
90
+  article_only = re.sub(r"style=\"min-height:250px\"", 'style=\"min-height:1px\"', article_only)
53 91
   article_only = re.sub(r"style=\"min-height:298px\"", 'style=\"min-height:1px\"', article_only)
54 92
   article_only = re.sub(r"style=\"min-height:600px\"", 'style=\"min-height:1px\"', article_only)
55 93
   article_only = re.sub(r"class=\"center absolute w-100\" style=\"top:-12px\"", '', article_only)