...
|
...
|
@@ -57,44 +57,3 @@ def article(url):
|
57
|
57
|
say("LengthAfter : "+str(lenAfter))
|
58
|
58
|
say("Gain : "+str(lenGain)+"%")
|
59
|
59
|
return pageContent
|
60
|
|
-
|
61
|
|
-
|
62
|
|
def articleOld(url):
    """Download *url* and return a trimmed HTML fragment for the article.

    The returned string is a set of Open Graph ``<meta>`` tags (type, title,
    description, url, image, image type) followed by the article body wrapped
    in ``<article>``. The body is the slice of the downloaded page that
    starts at the ``<div class="content_body">`` marker and stops just before
    the ``content_body_bottom`` marker. Inside that slice, AMP image tags
    become plain ``<img>`` tags, headings are demoted one level, and
    root-relative links are pointed at ``//www.bfmtv.com``.

    Raises:
        ValueError: if either content marker is absent from the page
            (raised by ``str.index``).
    """
    say("Article: "+url)
    # NOTE(review): no timeout is passed, so a stalled server blocks the
    # caller indefinitely -- consider adding one.
    response = requests.get(url, allow_redirects=True)
    content = response.text

    title = newsParser.articleTitle(content)
    image_url = newsParser.articleImage(content)
    description = newsParser.articleDescription(content)

    body_start_marker = "<div class=\"content_body\">"
    body_end_marker = "<div class=\"content_body\" id=\"content_body_bottom\">"

    # NOTE(review): title/description/url are interpolated into attribute
    # values without escaping; whether newsParser already returns
    # HTML-escaped text is not visible from here -- confirm before escaping.
    page_content = (
        "<meta property=\"og:type\" content=\"article\">\n"
        + "<meta property=\"og:title\" content=\"" + title + "\">\n"
        + "<meta property=\"og:description\" content=\"" + description + "\">\n"
        + "<meta property=\"og:url\" content=\"" + url + "\">\n"
        + "<meta property=\"og:image\" content=\"" + image_url + "\">\n"
        + "<meta property=\"og:image:type\" content=\"image/jpeg\">"
    )

    body_start = content.index(body_start_marker)
    body_end = content.index(body_end_marker)
    body = content[body_start:body_end]

    # Turn AMP image elements into regular <img> tags.
    body = body.replace("<amp-img", "<img").replace("</amp-img>", "")

    # Demote headings one level. h2 -> h3 must run before h1 -> h2,
    # otherwise former h1 headings would be demoted twice.
    body = body.replace("<h2", "<h3").replace("</h2>", "</h3>")
    body = body.replace("<h1", "<h2").replace("</h1>", "</h2>")

    # Point root-relative links at the site host. The negative lookahead
    # leaves protocol-relative links (href="//host/...") untouched; they
    # were previously rewritten to "//www.bfmtv.com//host/...".
    body = re.sub(r'href="/(?!/)', 'href="//www.bfmtv.com/', body)

    page_content += "<article>" + body + "</article>"
    # Put adjacent tags on separate lines.
    return page_content.replace("><", ">\n<")
|