# radiofrance2rss / radiofrance2rss.py
# (repository viewer header residue: commit 31ee495, 1 contributor, 192 lines | 6.022kb)
#!/usr/bin/python3
from feedgen.feed import FeedGenerator
import argparse
import requests
import re
import json
from bs4 import BeautifulSoup

raw_page = None
raw_meta = None
cpt=0
output_html = f"page.{cpt}.html"


APPNAME="RadioFrance2RSS"
APPTTL=86400
meta_image=None
meta_title=None
meta_published_time=None
meta_modified_time=None
meta_url=None
meta_description=None
meta_author=None

url_scheme=None
url_fqdn=None

def download_page(url):
    global cpt
    try:
        response = requests.get(url)
        if response.status_code == 200:
            output_html = f"page.{cpt}.html"
            with open(output_html, 'wb') as file:
                file.write(response.content)
            return response.content.decode("utf-8")
            print(f"Downloaded successfully and saved to {output_html}")
            cpt+=1
        else:
            print(f"Failed to download. Status code: {response.status_code}")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return None
 

def meta_property(raw,key):
  pattern=r'<meta property="'+key+'" content="(.*?)">'
  matches = re.findall(pattern, raw)
  if matches:
    return matches[0]
  else:
    return None
def meta_name(raw,key):
  pattern=r'<meta name="'+key+'" content="(.*?)">'
  matches = re.findall(pattern, raw)
  if matches:
    return matches[0]
  else:
    return None
  
    
  
def meta_extract():
  global raw_meta
  global meta_image
  global meta_title
  global meta_published_time
  global meta_modified_time
  global meta_url
  global meta_description
  global meta_author
  str_meta_start='<!-- HEAD'
  
  index = raw_page.find(str_meta_start)
  occurrence_count = 0
  occurrences_to_find = 2

  index_start = index
  index_stop = -2
  while index != -1 and occurrence_count < occurrences_to_find:
    index = raw_page.find(str_meta_start, index + 1)
    if -2 == index_start:
      index_start=index
    elif -2 == index_stop:
      index_stop=index
    occurrence_count += 1

  if 2 == occurrence_count:
    raw_meta = raw_page[index_start:index_stop]
    meta_image=meta_property(raw_meta,"og:image")
    meta_title=meta_property(raw_meta,"og:title")
    meta_title=meta_title.replace(" : podcast et émission en replay","")
    meta_title=meta_title.replace(" : écouter le podcast et replay de franceinfo","")
    meta_title=meta_title.replace(" en replay et podcast sur France Culture","")
    meta_title=meta_title.replace(" - Écouter en replay et podcast sur France Inter","")
    meta_title=meta_title.replace(" : un podcast à écouter en ligne","")
    meta_title=meta_title.replace(" : écouter le podcast et replay de France Inter","")
    #meta_title=meta_title.replace("","")
    meta_url=meta_property(raw_meta,"og:url")
    meta_published_time=meta_property(raw_meta,"article:published_time")
    meta_modified_time=meta_property(raw_meta,"article:modified_time")
    meta_description=meta_property(raw_meta,"og:description")
    try:
      meta_author=meta_title.split(" | ")[1]
    except:
      meta_author=meta_name(raw_meta,"twitter:site")
      meta_author=meta_author[1:]
    meta_author=meta_author.replace("franceinter","France_Inter")
    meta_author=meta_author.replace("franceinfo","France_Info")
    meta_author=meta_author.replace("franceculture","France_Culture")
          
    meta_title=meta_title.split(" | ")[0]
  
def create_rss_feed():
    fg = FeedGenerator()
    fg.load_extension("podcast", rss=True)

    fg.id(meta_url)
    fg.title(meta_title)
    fg.author({'name': meta_author})
    fg.link(href=meta_url, rel='alternate')
    fg.description(meta_description)
    fg.docs("")
    fg.ttl(APPTTL)
    fg.generator(APPNAME)
    fg.image(meta_image)
    
    fg.podcast.itunes_author(meta_author)
    fg.podcast.itunes_image(meta_image)
    fg.podcast.itunes_owner(name=meta_author, email=meta_author)
    
    soup = BeautifulSoup(raw_page, 'html.parser')
    cards = soup.find_all('div',class_="CardTitle")
    for card in cards:
        soup2 = BeautifulSoup(str(card), 'html.parser')
        links = soup2.find_all('a', href=True)
        
        for link in links:
            link_content=""
            fe = fg.add_entry()
            full_link=f"{url_scheme}//{url_fqdn}{link['href']}"
            fe.id(full_link)
            
            link_content = download_page(full_link)
            soup2 = BeautifulSoup(link_content, 'html.parser')
            json2 = soup2.find_all('script')
            json_data_str=json2[1].text.replace("@","")
            json_data = json.loads(json_data_str)
            element_title=json_data['graph'][0]['name']
            element_description=json_data['graph'][0]['description']
            element_dateCreated=json_data['graph'][0]['dateCreated']
            try:
                element_contentUrl=json_data['graph'][0]['mainEntity']['contentUrl']
            except:
                print(json_data_str)


                      
            item_title=link.text
            fe.id=full_link
            fe.title(item_title)
            fe.link(href=full_link, rel='alternate')
            fe.description(element_description)
            fe.pubDate(element_dateCreated)
            fe.enclosure(url=element_contentUrl, type="audio/mpeg")
            print(f"  {item_title}")
                
    # Generate the XML feed
    rss_feed = fg.rss_str(pretty=True)

    outputFilameRaw=f"{meta_author}-{meta_title}.xml"
    outputFilame=outputFilameRaw.replace(" ","_").replace(":","_")
    print(f"Output: {outputFilame}")
    with open(outputFilame, 'w') as f:
        f.write(rss_feed.decode('utf-8'))

if __name__ == "__main__":
  parser = argparse.ArgumentParser(description='Radiofrance URL to RSS')
  parser.add_argument('-u', '--url', type=str, required=True, help='radiofrance url')
  args = parser.parse_args()
  url=args.url
  url_scheme=url.split("/")[0]
  url_fqdn=url.split("/")[2]

  raw_page = download_page(url)
  meta_extract()
  create_rss_feed()
  print("----------")