...
|
...
|
@@ -92,49 +92,3 @@ def article(url):
|
92
|
92
|
say("Length: "+str(len(article_only)))
|
93
|
93
|
return pageContent
|
94
|
94
|
|
95
|
|
def _find_first(haystack, needles, default):
    """Return the index of the first of *needles* found in *haystack*.

    The needles are tried in order; the position of the first one that occurs
    is returned. If none occurs, *default* is returned.
    """
    for needle in needles:
        position = haystack.find(needle)
        if position != -1:
            return position
    return default


def articleOld(url):
    """Fetch *url* and return Open Graph meta tags followed by the article markup.

    The page is downloaded, its image URL, title and description are obtained
    from ``articleImage``, ``articleTitle`` and ``articleDescription``, and the
    markup between the opening ``<article`` (or ``<body``) tag and the closing
    ``</article>`` (or ``</body>``) tag is extracted. When a marker is missing
    the start falls back to 0 and the end to the full length of the page.

    Args:
        url: Address of the page to fetch; also emitted as ``og:url``.

    Returns:
        A string with the ``<meta>`` lines followed by the extracted markup
        wrapped in ``<article>...</article>``.
    """
    say("Article: "+url)
    r = requests.get(url, allow_redirects=True)
    content = r.text
    # The former `r.html.find('article')` call was dropped: its result was
    # discarded, and a plain requests response exposes no `.html` attribute.

    articleStrImageUrl = articleImage(content)
    articleStrTitle = articleTitle(content)
    articleStrDescription = articleDescription(content)

    # NOTE(review): values are interpolated into attributes without escaping;
    # confirm whether the article* helpers already return HTML-safe text.
    pageContent = "".join([
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+articleStrTitle+"\">\n",
        "<meta property=\"og:description\" content=\""+articleStrDescription+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+articleStrImageUrl+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">\n",
    ])

    # Prefer the <article> element, then <body>, then the whole document.
    indexBegin = _find_first(content, ("<article", "<body"), 0)
    # Fixed: the last-resort end index used the undefined name `strlen`.
    indexEnd = _find_first(content, ("</article>", "</body>"), len(content))

    article_only = content[indexBegin:indexEnd]
    article_only = article_only.replace("<amp-img", "<img")
    article_only = article_only.replace("</amp-img>", "")
    # Demote headings one level. h2 -> h3 must run before h1 -> h2, otherwise
    # the freshly demoted h1 headings would be demoted a second time.
    article_only = article_only.replace("<h2", "<h3")
    article_only = article_only.replace("</h2>", "</h3>")
    article_only = article_only.replace("<h1", "<h2")
    article_only = article_only.replace("</h1>", "</h2>")
    # Put adjacent tags on separate lines.
    article_only = article_only.replace("><", ">\n<")

    pageContent += "<article>"+article_only+"</article>"
    return pageContent
|