channel->title; $channel['link'] = $xml->channel->link; $channel['description'] = $xml->channel->description; $channel['pubDate'] = $xml->channel->pubDate; $channel['timestamp'] = strtotime($xml->channel->pubDate); echo '

' . $channel['title'].PHP_EOL; echo ''.PHP_EOL; echo ''.PHP_EOL; echo ''.PHP_EOL; echo '

'.PHP_EOL; $cpt=0; foreach ($xml->channel->item as $item) { $article = array(); $article['title'] = $item->title; $article['link'] = $item->link; $orgStrings = array('?xtor=RSS-3208'); $newStrings = array(''); $article['link'] = str_replace($orgStrings, $newStrings, $article['link']); $article['pubDate'] = $item->pubDate; $article['timestamp'] = strtotime($item->pubDate); $article['description'] = $item->description; $article['image'] = $item->enclosure['url']; $articles[$cpt] = $article; echo '
'.PHP_EOL; echo '

'.PHP_EOL; echo '  '.PHP_EOL; echo ' '; echo $article['title'].'  '.PHP_EOL; echo '
'.PHP_EOL; $cpt++; if( $cpt > $NEWS_RSS_MAX_ITEMS ) { break; } } } echo ''.PHP_EOL; echo '
'.PHP_EOL; echo '
'.PHP_EOL; echo '
'.PHP_EOL; $cpt=0; foreach ($articles as $article ) { $cpt_prev=$cpt-1; $cpt_next=$cpt+1; echo PHP_EOL.PHP_EOL.''.PHP_EOL; echo "
\n"; echo "
"; echo ""; $article_content = file_get_contents($article['link']); $doc = new DOMDocument(); $doc->preserveWhiteSpace = false; $doc->formatOutput = true; $libxml_previous_state = libxml_use_internal_errors(true); $doc->loadHTML($article_content); libxml_clear_errors(); libxml_use_internal_errors($libxml_previous_state); $articles = $doc->getElementsByTagName('article'); $article_only=""; if( isset($articles[0]) ) { $article_only=DOMinnerHTML($articles[0]); } else { $article_only = "Extraction Failed"; ERROR("article($cpt) : $article_only : ".$article['link']); } $figure=""; $SEARCH_SUB='édition abonné'; $pos_start=strpos($article_content, $SEARCH_SUB); if(!$pos_start) { //Second Test DEBUG("article($cpt) : Non Abonne 1"); $SEARCH_SUB='

Article réservé aux abonnés

'; $pos_start=strpos($article_content, $SEARCH_SUB); if(!$pos_start) { DEBUG("article($cpt) : Non Abonne 2"); } else { DEBUG("article($cpt) : Abonne 2"); } } if($pos_start) { try { $figures = $doc->getElementsByTagName('figure'); if( NULL === $figures[0] ) { DEBUG("article($cpt) : No Image"); } else { $figure = DOMinnerHTML($figures[0]); $re = '/(.+?)/'; preg_match($re, $figure, $array); if(count($array) >= 5 ) { $figure = '
'.$array[4].'
 ('.$array[5].')
'; } } } catch(Exception $e) { ERROR("article($cpt) : Exception".$e->getMessage()); } $article_abonne = str_replace("www.lemonde.fr", "abonnes.lemonde.fr", $article['link']); $article_content = file_get_contents($article_abonne); $doc = new DOMDocument(); $doc->preserveWhiteSpace = false; $doc->formatOutput = true; $libxml_previous_state = libxml_use_internal_errors(true); $doc->loadHTML($article_content); libxml_clear_errors(); libxml_use_internal_errors($libxml_previous_state); $articles = $doc->getElementsByTagName('article'); $article_only=""; if( isset($articles[0]) ) { $article_only=DOMinnerHTML($articles[0]); } else { $article_only = "Extraction Failed for Subscribed Article"; break; } } $orgStrings = array(' href="/'); $newStrings = array(' href="https://www.lemonde.fr/'); $article_only = str_replace($orgStrings, $newStrings, $article_only); //Remove Blank lines $temp = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "\n", $article_only); $article_only = $temp; $temp = preg_replace('/\s\s+/', ' ', $article_only); $article_only = $temp; //Remove Social section $re = '/
  • <\/li>/s'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; //Cleanup end-of-article extraction $re = '/
    (.+)commentaires <\/a> <\/section>/'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; $re = '/
  • <\/li>/'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; $re = '/

    Les plus lus<\/p>/'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; $re = '/

  • Annonces automobiles<\/span> avec La Centrale<\/span> <\/div>/'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; $re = '/
  • Dans la même rubrique<\/p>/'; $temp = preg_replace($re, '', $article_only); $article_only = $temp; $re = '/