# RadioFrance2RSS: build an RSS podcast feed from a Radio France show page
#!/usr/bin/python3
from feedgen.feed import FeedGenerator
import argparse
import requests
import re
import json
from bs4 import BeautifulSoup
raw_page = None
raw_meta = None
cpt=0
output_html = f"page.{cpt}.html"
APPNAME="RadioFrance2RSS"
APPTTL=86400
meta_image=None
meta_title=None
meta_published_time=None
meta_modified_time=None
meta_url=None
meta_description=None
meta_author=None
url_scheme=None
url_fqdn=None
def download_page(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure.

    Side effects: saves the raw bytes to ``page.<cpt>.html`` (for debugging)
    and increments the module-level counter ``cpt`` on success, so each
    downloaded page gets its own dump file.
    """
    global cpt
    try:
        response = requests.get(url)
        if response.status_code == 200:
            output_html = f"page.{cpt}.html"
            with open(output_html, 'wb') as file:
                file.write(response.content)
            # BUGFIX: these two statements used to sit after the return and
            # were dead code — the message never printed and cpt never
            # advanced, so every dump overwrote page.0.html.
            print(f"Downloaded successfully and saved to {output_html}")
            cpt+=1
            return response.content.decode("utf-8")
        else:
            print(f"Failed to download. Status code: {response.status_code}")
            return None
    except Exception as e:
        # Network errors, file-write errors, decode errors: report and give up.
        print(f"An error occurred: {e}")
        return None
def meta_property(raw, key):
    """Return the content of the first ``<meta property="key" content="...">`` in *raw*.

    *key* is regex-escaped so keys are always matched literally, and the tag
    may be self-closing (``/>``).  Returns None when no matching tag exists.
    """
    pattern = r'<meta property="' + re.escape(key) + r'" content="(.*?)"\s*/?>'
    matches = re.findall(pattern, raw)
    return matches[0] if matches else None
def meta_name(raw, key):
    """Return the content of the first ``<meta name="key" content="...">`` in *raw*.

    *key* is regex-escaped so keys are always matched literally, and the tag
    may be self-closing (``/>``).  Returns None when no matching tag exists.
    """
    pattern = r'<meta name="' + re.escape(key) + r'" content="(.*?)"\s*/?>'
    matches = re.findall(pattern, raw)
    return matches[0] if matches else None
def meta_extract():
    """Populate the meta_* globals from the module-level page ``raw_page``.

    The interesting metadata lives between the first two ``<!-- HEAD``
    markers in the page; that slice is stored in ``raw_meta`` and the
    individual OpenGraph / article fields are copied into the globals.
    When fewer than two markers are present, all globals are left untouched.
    """
    global raw_meta
    global meta_image, meta_title, meta_published_time, meta_modified_time
    global meta_url, meta_description, meta_author
    marker = '<!-- HEAD'
    # BUGFIX: the original occurrence-counting loop contained a dead branch
    # ('if -2 == index_start' could never fire) and only terminated correctly
    # by accident; two plain find() calls express the same intent.
    index_start = raw_page.find(marker)
    index_stop = raw_page.find(marker, index_start + 1) if index_start != -1 else -1
    if index_start == -1 or index_stop == -1:
        return
    raw_meta = raw_page[index_start:index_stop]
    meta_image = meta_property(raw_meta, "og:image")
    meta_title = meta_property(raw_meta, "og:title")
    # Strip the per-station boilerplate suffixes from the show title.
    for suffix in (
        " : podcast et émission en replay",
        " : écouter le podcast et replay de franceinfo",
        " en replay et podcast sur France Culture",
        " - Écouter en replay et podcast sur France Inter",
        " : un podcast à écouter en ligne",
        " : écouter le podcast et replay de France Inter",
    ):
        meta_title = meta_title.replace(suffix, "")
    meta_url = meta_property(raw_meta, "og:url")
    meta_published_time = meta_property(raw_meta, "article:published_time")
    meta_modified_time = meta_property(raw_meta, "article:modified_time")
    meta_description = meta_property(raw_meta, "og:description")
    try:
        # Titles look like "Show name | Station"; the station is the author.
        meta_author = meta_title.split(" | ")[1]
    except IndexError:
        # No " | " separator: fall back to the twitter handle, e.g. "@franceinter".
        # NOTE(review): the source's indentation was lost; the handle cleanup
        # below is assumed to belong to this fallback path (stripping a
        # leading '@' only makes sense on a twitter handle) — confirm upstream.
        meta_author = meta_name(raw_meta, "twitter:site")
        meta_author = meta_author[1:]  # drop the leading '@'
        meta_author = meta_author.replace("franceinter", "France_Inter")
        meta_author = meta_author.replace("franceinfo", "France_Info")
        meta_author = meta_author.replace("franceculture", "France_Culture")
    meta_title = meta_title.split(" | ")[0]
def create_rss_feed():
    """Build the podcast RSS feed and write it to ``<author>-<title>.xml``.

    Channel-level data comes from the meta_* globals (populated by
    meta_extract()).  Episode entries are discovered by following every link
    inside the page's ``CardTitle`` cards; each episode page is downloaded
    and its description, date and audio URL are read from the embedded
    JSON-LD script.  Spaces and ':' in the output filename become '_'.
    """
    fg = FeedGenerator()
    fg.load_extension("podcast", rss=True)
    fg.id(meta_url)
    fg.title(meta_title)
    fg.author({'name': meta_author})
    fg.link(href=meta_url, rel='alternate')
    fg.description(meta_description)
    fg.docs("")
    fg.ttl(APPTTL)
    fg.generator(APPNAME)
    fg.image(meta_image)
    fg.podcast.itunes_author(meta_author)
    fg.podcast.itunes_image(meta_image)
    fg.podcast.itunes_owner(name=meta_author, email=meta_author)
    soup = BeautifulSoup(raw_page, 'html.parser')
    cards = soup.find_all('div', class_="CardTitle")
    for card in cards:
        card_soup = BeautifulSoup(str(card), 'html.parser')
        for link in card_soup.find_all('a', href=True):
            # url_scheme keeps its trailing ':' (see __main__), so only '//' is added.
            full_link = f"{url_scheme}//{url_fqdn}{link['href']}"
            link_content = download_page(full_link)
            episode_soup = BeautifulSoup(link_content, 'html.parser')
            scripts = episode_soup.find_all('script')
            # The second <script> holds the JSON-LD episode data; '@' is
            # stripped so keys like '@graph' become plain 'graph'.
            json_data_str = scripts[1].text.replace("@", "")
            json_data = json.loads(json_data_str)
            element_description = json_data['graph'][0]['description']
            element_dateCreated = json_data['graph'][0]['dateCreated']
            # BUGFIX: element_contentUrl was left unbound when the lookup
            # failed (NameError on first failure, or silent reuse of the
            # PREVIOUS episode's audio URL on later ones).
            element_contentUrl = None
            try:
                element_contentUrl = json_data['graph'][0]['mainEntity']['contentUrl']
            except (KeyError, TypeError):
                print(json_data_str)
            item_title = link.text
            fe = fg.add_entry()
            # BUGFIX: the original did 'fe.id=full_link', overwriting the
            # bound method with a string instead of calling it.
            fe.id(full_link)
            fe.title(item_title)
            fe.link(href=full_link, rel='alternate')
            fe.description(element_description)
            fe.pubDate(element_dateCreated)
            if element_contentUrl is not None:
                fe.enclosure(url=element_contentUrl, type="audio/mpeg")
            print(f" {item_title}")
    # Generate the XML feed
    rss_feed = fg.rss_str(pretty=True)
    outputFilameRaw = f"{meta_author}-{meta_title}.xml"
    outputFilame = outputFilameRaw.replace(" ", "_").replace(":", "_")
    print(f"Output: {outputFilame}")
    with open(outputFilame, 'w') as f:
        f.write(rss_feed.decode('utf-8'))
if __name__ == "__main__":
    # CLI entry point: fetch the given Radio France show page and emit an RSS file.
    parser = argparse.ArgumentParser(description='Radiofrance URL to RSS')
    parser.add_argument('-u', '--url', type=str, required=True, help='radiofrance url')
    args = parser.parse_args()
    url=args.url
    # Crude URL split: for "https://host/path", element [0] is "https:" (colon
    # kept) and element [2] is the host; create_rss_feed() reassembles links
    # as f"{url_scheme}//{url_fqdn}{href}" and relies on exactly this format.
    url_scheme=url.split("/")[0]
    url_fqdn=url.split("/")[2]
    raw_page = download_page(url)  # also dumps the raw HTML to page.0.html
    meta_extract()                 # populate the meta_* globals from raw_page
    create_rss_feed()              # build and write the feed XML
    print("----------")