...
|
...
|
@@ -1,89 +0,0 @@
|
1
|
|
-from userio import *
|
2
|
|
-import requests
|
3
|
|
-import re
|
4
|
|
-
|
5
|
|
def articleImage(content):
    """Return the image URL from the page's ``og:image`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:image" content="`` and the next
        double quote, or ``"favicon.png"`` when the tag or its closing quote
        cannot be found.
    """
    marker = "<meta property=\"og:image\" content=\""
    try:
        # str.index raises ValueError when the substring is absent; both
        # lookups must sit inside the try for the fallback to be reachable.
        start = content.index(marker) + len(marker)
        end = content.index("\"", start)
    except ValueError:
        return "favicon.png"
    return content[start:end]
|
15
|
|
-
|
16
|
|
def articleDescription(content):
    """Return the text of the page's ``og:description`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``<meta property="og:description" content="`` and the
        next double quote, or ``"Description Extraction Failed"`` when the tag
        or its closing quote cannot be found.
    """
    marker = "<meta property=\"og:description\" content=\""
    try:
        # str.index raises ValueError when the substring is absent; both
        # lookups must sit inside the try for the fallback to be reachable.
        start = content.index(marker) + len(marker)
        end = content.index("\"", start)
    except ValueError:
        return "Description Extraction Failed"
    return content[start:end]
|
26
|
|
-
|
27
|
|
def articleTitle(content):
    """Return the text of the page's ``og:title`` meta tag.

    Args:
        content: HTML source of the page, as a string.

    Returns:
        The text between ``"og:title" content="`` and the next double quote,
        or ``"Title Extraction Failed"`` when the marker or its closing quote
        cannot be found.
    """
    # Only the tail of the tag is matched (no leading ``<meta property=``).
    # NOTE(review): presumably so tags whose attribute is not spelled
    # ``property=`` still match -- confirm against the scraped pages.
    marker = "\"og:title\" content=\""
    try:
        # str.index raises ValueError when the substring is absent; both
        # lookups must sit inside the try for the fallback to be reachable.
        start = content.index(marker) + len(marker)
        end = content.index("\"", start)
    except ValueError:
        return "Title Extraction Failed"
    return content[start:end]
|
38
|
|
-
|
39
|
|
def article(url):
    """Download *url* and return a reduced HTML fragment for the article.

    The fragment consists of Open Graph ``<meta>`` tags (type, title,
    description, url, image, image type) rebuilt from the values extracted by
    ``articleTitle``, ``articleDescription`` and ``articleImage``, followed by
    the article body wrapped in ``<article>...</article>``.

    The body is the slice of the page from the first
    ``<div class="middle-main-content">`` (falling back to ``<body``, then to
    the start of the page) up to ``<div id="IOSdialog"`` (falling back to
    ``</body>``, then to the end of the page).

    Args:
        url: Address of the page to fetch; also echoed into ``og:url``.

    Returns:
        The assembled HTML as a string.
    """
    say("ArticleDefault: " + url)
    # NOTE(review): no timeout is passed, so a stalled server can block this
    # call indefinitely; consider adding one once callers handle the error.
    response = requests.get(url, allow_redirects=True)
    content = response.text

    image_url = articleImage(content)
    title = articleTitle(content)
    description = articleDescription(content)

    head = (
        "<meta property=\"og:type\" content=\"article\">\n"
        + "<meta property=\"og:title\" content=\"" + title + "\">\n"
        + "<meta property=\"og:description\" content=\"" + description + "\">\n"
        + "<meta property=\"og:url\" content=\"" + url + "\">\n"
        + "<meta property=\"og:image\" content=\"" + image_url + "\">\n"
        + "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    )

    # Locate the start of the body: site-specific container first, then the
    # generic <body tag, then the very beginning of the document.
    begin = content.find("<div class=\"middle-main-content\">")
    if begin == -1:
        begin = content.find("<body")
    if begin == -1:
        begin = 0

    # Locate the end of the body, searching only after the start offset so
    # the end can never precede the beginning.
    end = content.find("<div id=\"IOSdialog\"", begin)
    if end == -1:
        end = content.find("</body>", begin)
    if end == -1:
        end = len(content)

    body = content[begin:end]

    # Literal rewrites, applied in order: AMP images become plain <img>, and
    # headings are demoted one level (h2 -> h3 must run before h1 -> h2).
    for old, new in (
        ("<amp-img", "<img"),
        ("</amp-img>", ""),
        ("<h2", "<h3"),
        ("</h2>", "</h3>"),
        ("<h1", "<h2"),
        ("</h1>", "</h2>"),
    ):
        body = body.replace(old, new)

    # Drop the l.lnc.nc/changan link. The match is non-greedy so that only
    # this anchor is removed, not everything up to the last </a> on the line.
    body = re.sub(
        r"<a href=\"http://l\.lnc\.nc/changan\" target=\"_blank\"(.+?)</a>",
        "",
        body,
    )

    # Strip layout classes from the wrapper divs.
    body = body.replace("<div class=\"col-md col-md-8\">", "<div>")
    body = body.replace("<div class=\"tm_center_widget\">", "<div>")

    # Put adjacent tags on separate lines.
    body = body.replace("><", ">\n<")

    return head + "<article>" + body + "</article>"
|