...
|
...
|
@@ -91,13 +91,17 @@ foreach ($articles as $article ) {
|
91
|
91
|
// Strip advertisement containers and inline script fragments from $article_only.
// Each call passes the markup and one PCRE pattern to re_remove(), which is
// defined outside this view (presumably it deletes every match -- confirm
// against its definition).
// None of these patterns carries the "s" modifier, so "." will not cross a
// line break: a fragment that spans several lines is left untouched.
$article_only = re_remove($article_only, '/<div id="bottom-slug" class="(.+?)"><p>Advertisement<\/p><\/div>/');
|
92
|
92
|
$article_only = re_remove($article_only, '/define\((.+?)}\);/');
|
93
|
93
|
$article_only = re_remove($article_only, '/var _gaq = _gaq \|\| \[\];/');
|
|
94
|
// The sponsor and top ad wrappers are matched as exact literal markup,
// inline style attribute included; any change in the served attributes
// will make these two patterns miss.
+ $article_only = re_remove($article_only, '/<div class="ad sponsor-wrapper" style="text-align:center;height:100%;display:block"><div id="sponsor" class="" data-position="sponsor"><\/div><\/div>/');
|
|
95
|
+ $article_only = re_remove($article_only, '/<div class="ad top-wrapper" style="text-align:center;height:100%;display:block;min-height:250px"><div id="top" class="place-ad" data-position="top"><\/div><\/div>/');
|
|
96
|
// The commented-out calls below have an empty pattern; they look like
// templates kept for future removals.
+ //$article_only = re_remove($article_only, '//');
|
|
97
|
+ //$article_only = re_remove($article_only, '//');
|
94
|
98
|
//$article_only = re_remove($article_only, '//');
|
95
|
99
|
|
96
|
100
|
|
97
|
101
|
// Some small replacements: reduce attribute-heavy opening tags to bare tags.
// $re is reused as a scratch variable; each pattern is applied by the
// preg_replace() call that follows it.
|
98
|
102
|
$re = '/<div id="top-wrapper" class="ResponsiveAd-(.+?)">/';
|
99
|
103
|
$article_only = preg_replace($re, '<div>', $article_only);
|
100
|
|
- $re = '/<time class="css-(.+?)>(.+?)<\/time>/';
|
|
104
|
// The added pattern also consumes the enclosing <li> and </li>, so the
// replacement below emits a bare <time> element holding only the second
// capture group (the text between the tags) and drops the list-item wrapper.
+ $re = '/<li><time class="css-(.+?)>(.+?)<\/time><\/li>/';
|
101
|
105
|
$article_only = preg_replace($re, '<time>\2</time>', $article_only);
|
102
|
106
|
// Requires a space inside the class attribute, i.e. at least two class
// names; a <p> with a single css- class is not matched.
$re = '/<p class="css-(.+?) (.+?)">/';
|
103
|
107
|
$article_only = preg_replace($re, '<p>', $article_only);
|
...
|
...
|
@@ -116,10 +120,42 @@ foreach ($articles as $article ) {
|
116
|
120
|
// Simplify the remaining article markup with a chain of preg_replace() passes.
// The passes are order-dependent: several patterns further down match the
// exact output of an earlier replacement, so do not reorder them.
$re = '/<div role="toolbar" aria-label="Social Media Share buttons, Save button, and Comments Panel with current comment count" class="css-(.+?)" data-testid="share-tools">/';
|
117
|
121
|
$article_only = preg_replace($re, '<div>', $article_only);
|
118
|
122
|
$re = '/<div class="bottom-of-article">/';
|
|
123
|
+ $article_only = preg_replace($re, '<div>', $article_only);
|
|
124
|
// A header that contains only an <h3> is deleted together with its text.
+ $re = '/<header class="css-(.+?)"><h3>(.+?)<\/h3><\/header>/';
|
|
125
|
+ $article_only = preg_replace($re, '', $article_only);
|
|
126
|
// First step of the header unwrap: normalise to the literal "<header><div>",
// which a later pass in this chain reduces to "<div>".
+ $re = '/<header class="css-(.+?) (.+?)"><div id="sponsor-wrapper" class="css-(.+?)">/';
|
|
127
|
+ $article_only = preg_replace($re, '<header><div>', $article_only);
|
|
128
|
// Only matches a <time> with no attributes that is directly wrapped in <li>.
// NOTE(review): a "<li><time class=..." replacement visible earlier in this
// file outputs <time> without the <li> wrapper, so this pass may never see
// that output -- confirm which markup it is meant to catch.
+ $re = '/<li><time>(.+?)<\/time><\/li>/';
|
|
129
|
+ $article_only = preg_replace($re, '<em>\1</em>', $article_only);
|
|
130
|
// Strip the attributes from the articleBody <section>; the result is the
// literal string that the closing-header pass below looks for.
+ $re = '/<\/div><\/div><\/header><section name="articleBody" itemprop="articleBody" class="css-(.+?)"><div>/';
|
|
131
|
+ $article_only = preg_replace($re, '</div></div></header><section><div>', $article_only);
|
|
132
|
// Second step of the header unwrap: drop the opening <header> tag.
+ $re = '/<header><div>/';
|
|
133
|
+ $article_only = preg_replace($re, '<div>', $article_only);
|
|
134
|
// Drop the closing header tag that sits just before the article section.
+ $re = '/<\/div><\/div><\/header><section><div>/';
|
|
135
|
+ $article_only = preg_replace($re, '</div></div><section><div>', $article_only);
|
|
136
|
// List items with a css- class are hidden with an inline style, not removed.
+ $re = '/<li class="css-(.+?)">/';
|
|
137
|
+ $article_only = preg_replace($re, '<li style="display: none;">', $article_only);
|
|
138
|
// Related-coverage footer links: the replacement drops the href and class
// and hides the anchor.
+ $re = '/<a href="(.+?)amp;module=RelatedCoverage&pgtype=Article&region=Footer" class="css-(.+?)"><div>/';
|
|
139
|
+ $article_only = preg_replace($re, '<a style="display: none;"><div>', $article_only);
|
|
140
|
// Figure unwrap, step 1: normalise the thumbLarge figure opening. The "."
// before "jpg" is unescaped, so it matches any single character.
+ $re = '/<div><figure class="toneNews" aria-label="media" role="group" itemscope="" itemprop="associatedMedia" itemid="(.+?)-thumbLarge.jpg" itemtype="http:\/\/schema\.org\/ImageObject"><div>/';
|
|
141
|
+ $article_only = preg_replace($re, '<div><figure><div>', $article_only);
|
|
142
|
// Step 2: collapse the three normalised opening tags into a single <div>.
+ $re = '/<div><figure><div>/';
|
|
143
|
+ $article_only = preg_replace($re, '<div>', $article_only);
|
|
144
|
// Step 3: remove the two closing tags that paired with the openings dropped
// in step 2.
// NOTE(review): this pattern removes every closing figure+div pair in the
// markup, including figures whose opening tag was not rewritten above --
// confirm that no other figures remain at this point.
+ $re = '/<\/figure><\/div>/';
|
|
145
|
+ $article_only = preg_replace($re, '', $article_only);
|
|
146
|
+ $re = '/<div id="bottom-wrapper" class="css-(.+?)">/';
|
|
147
|
+ $article_only = preg_replace($re, '<div>', $article_only);
|
119
|
148
|
|
120
|
149
|
// Remove leftover interactive widgets and empty elements by passing one
// pattern per call to re_remove() (defined outside this view).
// These run after the tag simplifications above, so patterns such as the
// bare div/button and div/h2 ones rely on the attributes already being gone.
$article_only = re_remove($article_only, '/<div><button aria-haspopup="true" aria-expanded="false" (.+?)><\/button><\/div>/');
|
121
|
150
|
$article_only = re_remove($article_only, '/<a class="css-(.+?)" href="#site-content">Skip to content<\/a><a class="css-(.+?)" href="#site-index">Skip to site index<\/a>/');
|
122
|
151
|
$article_only = re_remove($article_only, '/<div><span class=""><i class="OpenCommentsButton-icon--(.+?)"><span class="OpenCommentsButton-text--(.+?)"><\/span><\/i><\/span><\/div>/');
|
|
152
|
// Empty paragraphs.
+ $article_only = re_remove($article_only, '/<p><\/p>/');
|
|
153
|
// Any <h2> that is the sole child of a bare <div>, heading text included.
+ $article_only = re_remove($article_only, '/<div><h2>(.+?)<\/h2><\/div>/');
|
|
154
|
// <time> elements that still carry a two-part css- class and a datetime
// attribute, together with their text.
+ $article_only = re_remove($article_only, '/<time class="css-(.+?) (.+?)" datetime="(.+?)">(.+?)<\/time>/');
|
|
155
|
+ $article_only = re_remove($article_only, '/<div><button type="button" class="css-(.+?)">Show All<\/button><\/div>/');
|
|
156
|
// Only figcaptions with no content are targeted.
+ $article_only = re_remove($article_only, '/<figcaption itemprop="caption description" class="css-(.+?) (.+?)"><\/figcaption>/');
|
|
157
|
// Bottom ad wrapper, matched as exact literal markup including inline style.
+ $article_only = re_remove($article_only, '/<div class="ad bottom-wrapper" style="text-align:center;height:100%;display:block;min-height:90px"><div id="bottom" class="" data-position="bottom"><\/div><\/div>/');
|
|
158
|
// Commented-out call with an empty pattern; apparently a template for
// future removals.
+ //$article_only = re_remove($article_only, '//');
|
123
|
159
|
|
124
|
160
|
// Finally, remove empty lines: any run of line breaks whose intermediate
// lines hold only whitespace is collapsed into a single "\n".
// The pattern is a double-quoted PHP string, so \r, \n and \t reach PCRE as
// literal CR, LF and TAB characters, while \s is passed through unchanged
// (the \t inside the character class is therefore redundant with \s).
// There is no "m" modifier, so "^" anchors only at the very start of the
// text: leading blank lines are reduced to one "\n", not stripped entirely.
|
125
|
161
|
$article_only = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "\n", $article_only);
|