...
|
...
|
@@ -61,12 +61,25 @@ foreach ($articles as $article ) {
|
61
|
61
|
echo "<hr>";
|
62
|
62
|
echo "<a name=\"article-$cpt\">";
|
63
|
63
|
$article_content = file_get_contents($article['link']);
|
64
|
|
- $SEARCH_SUB1='<article class="post-content span8 tablet12 phone12">';
|
65
|
|
- $pos_start = strpos($article_content, $SEARCH_SUB1);
|
66
|
|
- $pos_start += strlen($SEARCH_SUB1);
|
|
64
|
+ $doc = new DOMDocument();
|
|
65
|
+ $doc->preserveWhiteSpace = false;
|
|
66
|
+ $doc->formatOutput = true;
|
|
67
|
+ $libxml_previous_state = libxml_use_internal_errors(true);
|
|
68
|
+ $doc->loadHTML($article_content ?: '<html></html>'); // file_get_contents() yields false on a failed fetch; an empty source makes loadHTML() throw ValueError on PHP 8, so parse an empty document and let the "no <article>" branch report the failure
|
|
69
|
+ libxml_clear_errors();
|
|
70
|
+ libxml_use_internal_errors($libxml_previous_state);
|
|
71
|
+ $articles = $doc->getElementsByTagName('article');
|
|
72
|
+ $article_only="";
|
|
73
|
+ if( isset($articles[0]) ) {
|
|
74
|
+ $article_only=DOMinnerHTML($articles[0]);
|
|
75
|
+ } else {
|
|
76
|
+ $article_only = "Extraction Failed for Subscribed Article";
|
|
77
|
+ WARNING($cpt." : ".$article_only);
|
|
78
|
+ break;
|
|
79
|
+ }
|
67
|
80
|
$SEARCH_SUB2='<div id="bottom_ad"';
|
68
|
|
- $pos_stop = strpos($article_content, $SEARCH_SUB2);
|
69
|
|
- $article_only = substr($article_content, $pos_start, $pos_stop - $pos_start);
|
|
81
|
+ $pos_stop = strpos($article_only, $SEARCH_SUB2);
|
|
82
|
+ $article_only = ($pos_stop === false) ? $article_only : substr($article_only, 0, $pos_stop); // strpos() returns false when the bottom-ad marker is absent; keep the whole extracted article instead of truncating it to an empty string
|
70
|
83
|
|
71
|
84
|
$orgStrings = array(' src="//www.numerama.com/');
|
72
|
85
|
$newStrings = array(' src="https://www.numerama.com/');
|