Showing 1 changed file with 9 additions and 2 deletions
+9 -2
newsParser/newsParser/newsWaPo.py
... ...
@@ -74,8 +74,15 @@ def article(url):
74 74
   article_only = re.sub(r'<div style="filter:blur(.+?)" class="w-100 mw-100 h-auto" width="600" height="(.+?)">','<div><h2>'+articleStrTitle+'</h2>',article_only)
75 75
   article_only = re.sub(r'<div style="min-height:358px"/>','<div>',article_only)
76 76
   #article_only = re.sub(r'','',article_only)
77
-  article_only = re.sub(r"href=\"/",'href="https://xxxxx/',article_only)
78
-  article_only = re.sub(r"src=\"/",'src="https://xxxxx/',article_only)
77
+  say("LenCurrent: "+str(len(article_only)))
78
+  article_only = re.sub(r'<div class="article-body grid-center grid-body" data-qa="article-body">','<div>',article_only)
79
+  article_only = re.sub(r'<div data-qa="article-image" class="hide-for-print">','<div>',article_only)
80
+  article_only = re.sub(r'<div class="article-body grid-full-bleed" data-qa="article-body">','<div>',article_only)
81
+  article_only = re.sub(r'<div class="dib gray-dark pl-xs pr-xs font-sans-serif light font-xxxxs lh-md" style="--primary-border-color:"/>','',article_only)
82
+  #article_only = re.sub(r'<div class="article-body grid-center grid-body" data-qa="article-body">(.?)</div>', r'\1',article_only,flags=re.M|re.S)
83
+  say("LenCurrent: "+str(len(article_only)))
84
+  #article_only = re.sub(r"href=\"/",'href="https://xxxxx/',article_only)
85
+  #article_only = re.sub(r"src=\"/",'src="https://xxxxx/',article_only)
79 86
   article_only = re.sub(r"^$",'',article_only)
80 87
   article_only = re.sub(r'^\s*$', '',article_only,flags=re.M|re.S)
81 88
   article_only = re.sub(r"><",'>\n<',article_only)