Numerama: Now extracted with DOM ・ 49bd985 ・ Gitprep

+18 -5

sources/news-numerama.php

@@ -61,12 +61,25 @@ foreach ($articles as $article ) {
   echo "<hr>";
   echo "<a name=\"article-$cpt\">";
   $article_content = file_get_contents($article['link']);
-  $SEARCH_SUB1='<article class="post-content span8 tablet12 phone12">';
-  $pos_start = strpos($article_content, $SEARCH_SUB1);
-  $pos_start += strlen($SEARCH_SUB1);
+  $doc = new DOMDocument();
+  $doc->preserveWhiteSpace = false;
+  $doc->formatOutput       = true;
+  $libxml_previous_state = libxml_use_internal_errors(true);
+  $doc->loadHTML($article_content);
+  libxml_clear_errors();
+  libxml_use_internal_errors($libxml_previous_state);
+  $articles = $doc->getElementsByTagName('article');
+  $article_only="";
+  if( isset($articles[0]) ) {
+    $article_only=DOMinnerHTML($articles[0]);
+  } else {
+    $article_only = "Extraction Failed for Subscribed Article";
+    WARNING($cpt." : ".$article_only);
+    break;
+  }
   $SEARCH_SUB2='<div id="bottom_ad"';
-  $pos_stop = strpos($article_content, $SEARCH_SUB2);
-  $article_only = substr($article_content, $pos_start, $pos_stop - $pos_start);
+  $pos_stop = strpos($article_only, $SEARCH_SUB2);
+  $article_only = substr($article_only, 0, $pos_stop);
 
   $orgStrings = array(' src="//www.numerama.com/');
   $newStrings = array(' src="https://www.numerama.com/');

...	...	@@ -61,12 +61,25 @@ foreach ($articles as $article ) {
61	61	echo "<hr>";
62	62	echo "<a name=\"article-$cpt\">";
63	63	$article_content = file_get_contents($article['link']);
64		- $SEARCH_SUB1='<article class="post-content span8 tablet12 phone12">';
65		- $pos_start = strpos($article_content, $SEARCH_SUB1);
66		- $pos_start += strlen($SEARCH_SUB1);
	64	+ $doc = new DOMDocument();
	65	+ $doc->preserveWhiteSpace = false;
	66	+ $doc->formatOutput = true;
	67	+ $libxml_previous_state = libxml_use_internal_errors(true);
	68	+ $doc->loadHTML($article_content);
	69	+ libxml_clear_errors();
	70	+ libxml_use_internal_errors($libxml_previous_state);
	71	+ $articles = $doc->getElementsByTagName('article');
	72	+ $article_only="";
	73	+ if( isset($articles[0]) ) {
	74	+ $article_only=DOMinnerHTML($articles[0]);
	75	+ } else {
	76	+ $article_only = "Extraction Failed for Subscribed Article";
	77	+ WARNING($cpt." : ".$article_only);
	78	+ break;
	79	+ }
67	80	$SEARCH_SUB2='<div id="bottom_ad"';
68		- $pos_stop = strpos($article_content, $SEARCH_SUB2);
69		- $article_only = substr($article_content, $pos_start, $pos_stop - $pos_start);
	81	+ $pos_stop = strpos($article_only, $SEARCH_SUB2);
	82	+ $article_only = substr($article_only, 0, $pos_stop);
70	83
71	84	$orgStrings = array(' src="//www.numerama.com/');
72	85	$newStrings = array(' src="https://www.numerama.com/');