...
|
...
|
@@ -35,7 +35,8 @@ def article(url):
|
35
|
35
|
article_only = re.sub(r"</h2>", '</h3>', article_only)
|
36
|
36
|
article_only = re.sub(r"<h1", '<h2', article_only)
|
37
|
37
|
article_only = re.sub(r"</h1>", '</h2>', article_only)
|
38
|
|
- article_only = re.sub(r'<script(.+?)</script>','',article_only)
|
|
38
|
+ article_only = re.sub(r'<script(.+?)</script>','',article_only,flags=re.M|re.S)
|
|
39
|
+ article_only = re.sub(r'<script(.+?)/>','',article_only)
|
39
|
40
|
#article_only = re.sub(r'','',article_only)
|
40
|
41
|
article_only = re.sub(r"href=\"/",'href="https://xxxxx/',article_only)
|
41
|
42
|
article_only = re.sub(r"src=\"/",'src="https://xxxxx/',article_only)
|
...
|
...
|
@@ -43,6 +44,7 @@ def article(url):
|
43
|
44
|
article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
|
44
|
45
|
article_only = re.sub(r"><",'>\n<',article_only)
|
45
|
46
|
|
|
47
|
+ #pageContent += "\n"+article_only+"\n"
|
46
|
48
|
pageContent += "<article>\n"+article_only+"\n</article>\n"
|
47
|
49
|
lenAfter=len(article_only)
|
48
|
50
|
lenGain=float(10000-int(float(100*lenAfter/lenBefore)*100))/100
|