Showing 1 changed file with 18 additions and 5 deletions
+18 -5
sources/news-numerama.php
... ...
@@ -61,12 +61,25 @@ foreach ($articles as $article ) {
61 61
   echo "<hr>";
62 62
   echo "<a name=\"article-$cpt\">";
63 63
   $article_content = file_get_contents($article['link']);
64
-  $SEARCH_SUB1='<article class="post-content span8 tablet12 phone12">';
65
-  $pos_start = strpos($article_content, $SEARCH_SUB1);
66
-  $pos_start += strlen($SEARCH_SUB1);
64
+  $doc = new DOMDocument();
65
+  $doc->preserveWhiteSpace = false;
66
+  $doc->formatOutput       = true;
67
+  $libxml_previous_state = libxml_use_internal_errors(true);
68
+  $doc->loadHTML($article_content);
69
+  libxml_clear_errors();
70
+  libxml_use_internal_errors($libxml_previous_state);
71
+  $articles = $doc->getElementsByTagName('article');
72
+  $article_only="";
73
+  if( isset($articles[0]) ) {
74
+    $article_only=DOMinnerHTML($articles[0]);
75
+  } else {
76
+    $article_only = "Extraction Failed for Subscribed Article";
77
+    WARNING($cpt." : ".$article_only);
78
+    break;
79
+  }
67 80
   $SEARCH_SUB2='<div id="bottom_ad"';
68
-  $pos_stop = strpos($article_content, $SEARCH_SUB2);
69
-  $article_only = substr($article_content, $pos_start, $pos_stop - $pos_start);
81
+  $pos_stop = strpos($article_only, $SEARCH_SUB2);
82
+  $article_only = substr($article_only, 0, $pos_stop);
70 83
 
71 84
   $orgStrings = array(' src="//www.numerama.com/');
72 85
   $newStrings = array(' src="https://www.numerama.com/');