# 1 contributor
from userio import *
import requests
import re
import newsParser
# Opening tag that marks the start of the article body in the fetched page.
_ARTICLE_BEGIN = "<article "

# Markers that end the article body, tried in this order; the first one found
# after the opening tag wins.
_ARTICLE_END_MARKERS = (
    "<div class=\"article-full__footer\">",
    "<section subscriptions-section=\"content-not-granted\">",
    "</article>",
)

# (pattern, replacement) pairs applied to the article fragment, in this exact
# order. The order matters: <h2> must be demoted to <h3> before <h1> is
# demoted to <h2>, otherwise former <h1> headings would end up as <h3>.
_ARTICLE_SUBSTITUTIONS = (
    # AMP images -> plain <img> tags.
    (r"<amp-img", '<img'),
    (r"</amp-img>", ''),
    # Demote headings by one level; the page title is emitted as <h2> by
    # article() itself.
    (r"<h2", '<h3'),
    (r"</h2>", '</h3>'),
    (r"<h1", '<h2'),
    (r"</h1>", '</h2>'),
    # Strip icon paths (both the <path>...</path> and the self-closing form).
    (r"<path class=\"icon-base-fill\"(.*?)</path>", ''),
    (r"<path class=\"icon-base-fill\"(.*?)\"/>", ''),
    # Strip share-popup handlers. The quote after "window.open(" may appear
    # literally or as the &quot; entity, so both spellings are accepted.
    (r"onclick=\"window.open\((?:\"|&quot;)https:(.*?)\);\">", ''),
    (r"<span class=\"text\">S'abonner</span>", ''),
    (r"<div id=\"pub_dfp_inread1\" class=\"pub pub_dfp pub_dfp_inread1 upto-tablet base-margin-bottom pub_with_light_background\"></div>", ''),
    (r"href=\"mailto:\?subject=(.*?)\"", ''),
    # Drop sizing/class attributes from inline SVGs.
    (r"<svg class=\"(.*?)\" viewBox=\"0 0 (.*?) (.*?)\">", '<svg>'),
    (r"<svg viewBox=\"0 0 (.*?) (.*?)\">", '<svg>'),
    (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline\"", ''),
    (r"class=\"btn btn-icon btn-icon btn-disc btn-pill btn-outline upto-tablet\"", ''),
    # Hide buttons. The replacement closes the tag right after the style
    # attribute; the onclick patterns in this table consume a trailing ">"
    # when they match, so this is presumably intentional - do not "tidy" it
    # without revisiting those patterns.
    (r"<button", '<button style="display:none;">'),
    (r"<aside class=\"social-links", '<aside style="display:none;" class="social-links'),
    (r"onclick=\"if\(navigator\.share\) (.*?)return false;\" >", ''),
    # Make root-relative links absolute on the site the article came from.
    # The lookahead leaves protocol-relative links (href="//host/...") alone.
    (r"href=\"\/(?!\/)", 'href="//www.midilibre.fr/'),
)


def _slice_article_html(content):
    """Return the part of *content* from the article opening tag up to the first end marker."""
    begin = content.index(_ARTICLE_BEGIN)
    for marker in _ARTICLE_END_MARKERS:
        # Search only after the opening tag so that a marker occurring earlier
        # in the page cannot produce an empty slice.
        end = content.find(marker, begin)
        if end != -1:
            return content[begin:end]
    raise ValueError("no article end marker found in page")


def _clean_article_html(fragment):
    """Apply every rewrite in _ARTICLE_SUBSTITUTIONS, in order, to *fragment*."""
    for pattern, replacement in _ARTICLE_SUBSTITUTIONS:
        # No count/flags argument: every occurrence is replaced. (Passing
        # re.MULTILINE as the fourth positional argument would set ``count``
        # and silently cap the number of replacements.)
        fragment = re.sub(pattern, replacement, fragment)
    return fragment


def article(url):
    """Fetch a midilibre.fr article and return a simplified HTML rendering.

    The AMP version of the page is downloaded, Open Graph ``<meta>`` tags,
    a title heading and the lead image are emitted, followed by the article
    body with AMP-specific markup, icons, ad placeholders and share controls
    rewritten or removed.

    Args:
        url: Address of the article. ``www.midilibre.fr`` is rewritten to
            its ``/amp`` variant before fetching.

    Returns:
        A string of HTML: meta tags, ``<h2>`` title, ``<img>`` and the
        cleaned body wrapped in ``<article>...</article>``.

    Raises:
        ValueError: If the page contains no ``<article `` opening tag or
            none of the known end markers after it.
    """
    say("Article: "+url)
    # Switch to the AMP page, unless the URL already points at it.
    if "www.midilibre.fr/amp/" not in url:
        url = url.replace("www.midilibre.fr","www.midilibre.fr/amp")
    # NOTE(review): no timeout is set, so a stalled server blocks this call
    # indefinitely; consider passing timeout=... once callers are checked.
    response = requests.get(url, allow_redirects=True)
    content = response.text

    image_url = newsParser.articleImage(content)
    title = newsParser.articleTitle(content)
    description = newsParser.articleDescription(content)

    body = _clean_article_html(_slice_article_html(content))

    # NOTE(review): title/description/image are interpolated as-is; whether
    # newsParser returns them already HTML-escaped is not visible from here.
    parts = [
        "<meta property=\"og:type\" content=\"article\">\n",
        "<meta property=\"og:title\" content=\""+title+"\">\n",
        "<meta property=\"og:description\" content=\""+description+"\">\n",
        "<meta property=\"og:url\" content=\""+url+"\">\n",
        "<meta property=\"og:image\" content=\""+image_url+"\">\n",
        "<meta property=\"og:image:type\" content=\"image/jpeg\">",
        "<h2>"+title+"</h2>\n",
        "<img src=\""+image_url+"\">\n",
        "<article>"+body+"</article>",
    ]
    return "".join(parts)