Showing 1 changed files with 0 additions and 89 deletions
-89
newsParser/newsParser/newsLNC.py
... ...
@@ -1,89 +0,0 @@
1
-from userio import *
2
-import requests
3
-import re
4
-
5
def articleImage(content):
  """Return the image URL declared in the page's ``og:image`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:image" content="`` and the next
    double quote, or ``"favicon.png"`` when the tag (or its closing quote)
    cannot be found.
  """
  articleImgBegin = "<meta property=\"og:image\" content=\""
  articleImgEnd = "\""
  # str.index raises ValueError when the marker is absent, so both lookups
  # have to live inside the try for the fallback to be reachable.
  try:
    valueStart = content.index(articleImgBegin) + len(articleImgBegin)
    valueEnd = content.index(articleImgEnd, valueStart)
  except ValueError:
    return "favicon.png"
  return content[valueStart:valueEnd]
15
-
16
def articleDescription(content):
  """Return the text declared in the page's ``og:description`` meta tag.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between ``<meta property="og:description" content="`` and the
    next double quote, or ``"Description Extraction Failed"`` when the tag
    (or its closing quote) cannot be found.
  """
  descriptionBegin = "<meta property=\"og:description\" content=\""
  descriptionEnd = "\""
  # str.index raises ValueError when the marker is absent, so both lookups
  # have to live inside the try for the fallback to be reachable.
  try:
    valueStart = content.index(descriptionBegin) + len(descriptionBegin)
    valueEnd = content.index(descriptionEnd, valueStart)
  except ValueError:
    return "Description Extraction Failed"
  return content[valueStart:valueEnd]
26
-
27
def articleTitle(content):
  """Return the title declared in the page's ``og:title`` meta tag.

  The marker searched for is only ``"og:title" content="`` (not the full
  ``<meta property=...`` prefix), so any attribute spelling that ends with
  that sequence matches.

  Args:
    content: HTML source of the page, as a string.

  Returns:
    The text between the marker and the next double quote, or
    ``"Title Extraction Failed"`` when the marker (or the closing quote)
    cannot be found.
  """
  titleBegin = "\"og:title\" content=\""
  titleEnd = "\""
  # str.index raises ValueError when the marker is absent, so both lookups
  # have to live inside the try for the fallback to be reachable.
  try:
    valueStart = content.index(titleBegin) + len(titleBegin)
    valueEnd = content.index(titleEnd, valueStart)
  except ValueError:
    return "Title Extraction Failed"
  return content[valueStart:valueEnd]
38
-
39
def article(url):
  """Download *url* and build a simplified HTML fragment for the article.

  The result is a string made of Open Graph ``<meta>`` tags (type, title,
  description, url, image, image type) followed by the article body wrapped
  in ``<article>...</article>``.

  The body is the slice of the page between
  ``<div class="middle-main-content">`` and ``<div id="IOSdialog"``. When a
  marker is missing the ``<body``/``</body>`` tags are used instead, and as
  a last resort the start/end of the document.

  Args:
    url: Address of the article page to fetch.

  Returns:
    The assembled HTML fragment as a string.
  """
  say("ArticleDefault: "+url)
  # NOTE(review): no timeout is passed, so a stalled server blocks this call
  # indefinitely -- consider adding one.
  r = requests.get(url, allow_redirects=True)
  content = r.text

  articleStrImageUrl = articleImage(content)
  articleStrTitle = articleTitle(content)
  articleStrDescription = articleDescription(content)

  pageContent = ""
  pageContent += "<meta property=\"og:type\" content=\"article\">\n"
  pageContent += "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n"
  pageContent += "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n"
  pageContent += "<meta property=\"og:url\" content=\""+url+"\">\n"
  pageContent += "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n"
  pageContent += "<meta property=\"og:image:type\" content=\"image/jpeg\">"

  articleCstBegin = "<div class=\"middle-main-content\">"
  articleCstEnd   = "<div id=\"IOSdialog\""
  articleCstBegin2 = "<body"
  articleCstEnd2   = "</body>"

  # str.find returns -1 when the marker is absent; fall back step by step.
  indexBegin = content.find(articleCstBegin)
  if indexBegin == -1:
    indexBegin = content.find(articleCstBegin2)
  if indexBegin == -1:
    indexBegin = 0

  indexEnd = content.find(articleCstEnd)
  if indexEnd == -1:
    indexEnd = content.find(articleCstEnd2)
  if indexEnd == -1:
    # Was strlen(content), which is not a Python builtin.
    indexEnd = len(content)

  article_only = content[indexBegin:indexEnd]

  # Turn AMP images into plain <img> tags.
  article_only = article_only.replace("<amp-img", "<img")
  article_only = article_only.replace("</amp-img>", "")

  # Demote headings by one level. h2 -> h3 must run before h1 -> h2,
  # otherwise the freshly created h2 tags would be demoted a second time.
  article_only = article_only.replace("<h2", "<h3")
  article_only = article_only.replace("</h2>", "</h3>")
  article_only = article_only.replace("<h1", "<h2")
  article_only = article_only.replace("</h1>", "</h2>")

  # Drop links of the form <a href="http://l.lnc.nc/changan" target="_blank">...</a>.
  # Non-greedy so the match stops at the link's own </a> instead of
  # swallowing everything up to the last </a> on the same line.
  article_only = re.sub(r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+?)</a>", '', article_only)

  # Strip layout classes from wrapper divs.
  article_only = article_only.replace("<div class=\"col-md col-md-8\">", "<div>")
  article_only = article_only.replace("<div class=\"tm_center_widget\">", "<div>")

  # One tag per line.
  article_only = article_only.replace("><", ">\n<")
  pageContent += "<article>"+article_only+"</article>"
  return pageContent