...
|
...
|
@@ -78,6 +78,18 @@ foreach ($articles as $article ) {
|
78
|
78
|
// Strip ad slots, lazy-image placeholders, "Image"/"Video" labels, the
// reprints/subscribe footer and inline script fragments from the article.
// re_remove() is defined outside this view; presumably it deletes every match
// of the pattern from the string - confirm against its definition.
// The patterns are applied in the order listed, each on the output of the
// previous one, so the order must be kept.
$noise_patterns = [
    '/<div id="sponsor-slug" (.+?)><p>Supported by<\/p><\/div>/',
    '/<div class="ad top-wrapper" style="text-align:center;height:100%;display:block;min-height:250px"><div id="top"><\/div><\/div>/',
    '/<div class="ad sponsor-wrapper" style="text-align:center;height:100%;display:block"><div id="sponsor"><\/div><\/div>/',
    '/<div data-testid="lazyimage-container" style="(.+?)"><\/div>/',
    '/<span class="(.+?)">Image<\/span>/',
    '/<div><\/div>/',
    '/<aside class="(.+?)"><\/aside>/',
    '/<div id="top-slug" class="(.+?)"><p>Advertisement<\/p><\/div>/',
    '/<span class="(.+?)">Video<\/span>/',
    '/<span>\. <a href="http:\/\/www\.nytreprints\.com\/">Order Reprints<\/a> \| <a href="http:\/\/www\.nytimes\.com\/pages\/todayspaper\/index\.html">Today’s Paper<\/a> \| <a href="https:\/\/www\.nytimes\.com\/subscriptions\/Multiproduct\/(.+?)">Subscribe<\/a><\/span>/',
    '/<div id="bottom-slug" class="(.+?)"><p>Advertisement<\/p><\/div>/',
    // NOTE(review): no "s" modifier, so "." stops at a newline and this only
    // removes define(...}); fragments that sit on a single line - confirm that
    // is the shape of the input.
    '/define\((.+?)}\);/',
    '/var _gaq = _gaq \|\| \[\];/',
];
foreach ($noise_patterns as $noise_pattern) {
    $article_only = re_remove($article_only, $noise_pattern);
}

//Some little replacements
// This pattern is consumed by a statement that lies outside this view.
$re = '/<div id="top-wrapper" class="ResponsiveAd-(.+?)">/';
|
...
|
...
|
@@ -88,6 +100,8 @@ foreach ($articles as $article ) {
|
88
|
100
|
// Apply the pattern that was assigned to $re just before this point (the
// assignment is outside this view): every match becomes a bare <p>.
$article_only = preg_replace($re, '<p>', $article_only);

// Reduce class-carrying headings to bare tags, keeping only the heading text
// (capture group 2). The rewrites run in the order listed.
// Note: the h2 pattern opens a quote after class= that it never closes, so its
// group 1 also swallows the closing quote; that is harmless because group 1 is
// never used in the replacement.
$heading_rewrites = [
    '/<h1 class=(.+?)><span>(.+?)<\/span><\/h1>/' => '<h1>\2</h1>',
    '/<h2 class="(.+?)>(.+?)<\/h2>/' => '<h2>\2</h2>',
    '/<h3 class=(.+?)>(.+?)<\/h3>/' => '<h3>\2</h3>',
];
foreach ($heading_rewrites as $heading_pattern => $heading_replacement) {
    $article_only = preg_replace($heading_pattern, $heading_replacement, $article_only);
}

// This pattern is consumed by a statement that lies outside this view.
$re = '/<div class="css-(.+?) StoryBodyCompanionColumn">/';
|
...
|
...
|
@@ -98,12 +112,15 @@ foreach ($articles as $article ) {
|
98
|
112
|
// Apply the pattern that was assigned to $re just before this point (the
// assignment is outside this view): every match becomes <h2> + group 2 + </h2>.
$article_only = preg_replace($re, '<h2>\2</h2>', $article_only);

// Replace the share toolbar's opening tag with a plain <div>.
$article_only = preg_replace(
    '/<div role="toolbar" aria-label="Social Media Share buttons, Save button, and Comments Panel with current comment count" class="css-(.+?)" data-testid="share-tools">/',
    '<div>',
    $article_only
);

// NOTE(review): this pattern is assigned but no visible statement applies it.
// Confirm whether a preg_replace()/re_remove() call is missing here or whether
// the assignment can be dropped.
$re = '/<div class="bottom-of-article">/';

// Disabled list flattening, kept for reference.
/*
$article_only = preg_replace('/<li class="css-(.+?)"><div><ul class="css-(.+?)">/', '<li>', $article_only);
$article_only = preg_replace('/<\/ul><\/div><\/li>/', '</li>', $article_only);
*/

// Drop leftover page chrome: popup buttons, skip links and the comments
// button. re_remove() is defined outside this view; presumably it deletes
// every match of the pattern - confirm against its definition.
$chrome_patterns = [
    '/<div><button aria-haspopup="true" aria-expanded="false" (.+?)><\/button><\/div>/',
    '/<a class="css-(.+?)" href="#site-content">Skip to content<\/a><a class="css-(.+?)" href="#site-index">Skip to site index<\/a>/',
    '/<div><span class=""><i class="OpenCommentsButton-icon--(.+?)"><span class="OpenCommentsButton-text--(.+?)"><\/span><\/i><\/span><\/div>/',
];
foreach ($chrome_patterns as $chrome_pattern) {
    $article_only = re_remove($article_only, $chrome_pattern);
}

//Finally remove empty lines
// Collapses leading line breaks and any run of whitespace-only lines into a
// single "\n".
$article_only = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "\n", $article_only);
|