#!/usr/bin/env python

# $LastChangedDate: 2006-08-01 18:03:21 +0200 (Tue, 01 Aug 2006) $
# $Rev: 23 $
# $Author: paul $

"""
SYNOPSIS

tv_grab_nl_py is a python script that trawls tvgids.nl for TV
programming information and outputs it in XMLTV-formatted output (see
http://membled.com/work/apps/xmltv). Users of MythTV
(http://www.mythtv.org) will appreciate the output generated by this
grabber, because it fills the category fields, i.e. colors in the EPG,
and has logos for most channels automagically available. Check the
website below for screenshots. The newest version of this script can be
found here:

     http://visualisation.tudelft.nl/~paul/grabber

USAGE

Check the web site above and/or run script with --help and start from
there

HISTORY

tv_grab_nl_py used to be called tv_grab_nl_pdb, first released on
2003/07/09. The name change was necessary because more and more people
are actively contributing to this script and I always disliked using my
initials (I was just too lazy to change it). At the same time I switched
from using CVS to SVN and as a result the version numbering scheme has
changed. The latest official release of tv_grab_nl_pdb is 0.48. The
first official release of tv_grab_nl_py is 6.

QUESTIONS

Questions (and patches) are welcome at: paul at pwdebruin dot net.

IMPORTANT NOTES

If you were using tv_grab_nl from the XMLTV bundle then enable the
compat flag or use the --compat command-line option. Otherwise, the
xmltvid's are wrong and you will not see any new data in MythTV.

CONTRIBUTORS

Main author: Paul de Bruin (paul at pwdebruin dot net)

Michel van der Laan made available his extensive collection of
high-quality logos that is used by this script.

Michael Heus has taken the effort to further enhance this script so that
it now also includes:
- Credit info: directors, actors, presenters and writers
- removal of programs that are actually just groupings/broadcasters
  (e.g. "KETNET", "Wild Friday", "Z@pp")
- Star-rating for programs tipped by tvgids.nl
- Black&White, Stereo and URL info
- Better detection of Movies
- and more...

Several other people have provided feedback and patches (these are the
people I could find in my email archive, if you are missing from this
list let me know):
Huub Bouma, Roy van der Kuil, Remco Rotteveel, Mark Wormgoor,
Dennis van Onselen, Hugo van der Kooij
"""

# Modules we need
import re, urllib, getopt, sys
import time, random
import htmlentitydefs, os, os.path, pickle
from string import replace, split, strip
from threading import Thread

# Do extra debug stuff: debugging is switched on only when the optional
# 'redirect' module can be imported.
# NOTE(review): the bare except also hides errors raised *inside* the
# redirect module itself, not just its absence -- confirm that is intended.
debug = 1
try:
    import redirect
except:
    debug = 0
    pass

# globals
tvgids = 'http://www.tvgids.nl/'
uitgebreid_zoeken = tvgids + 'zoeken'

# How many seconds to wait before we time out on a
# url fetch; 10 seconds seems reasonable.
global_timeout = 10

# Wait a random number of seconds between each page fetch.
# We want to be nice and not hammer tvgids.nl (these are the
# friendly people that provide our data...).
# Also, it appears tvgids.nl throttles its output.
# So there is no point in lowering these numbers; if you
# are in a hurry, use the (default) fast mode.
nice_time = [1, 3]

# Maximum number of characters to use for program description.
# This is a MythTV-specific setting.
desc_len = 475

# Maximum length in minutes of overlapping programming to correct
max_overlap = 10

# Create a category translation dictionary
# Look in mythtv/themes/blue/ui.xml for all category names
# The keys are the categories used by tvgids.nl (lowercase please)
# (Each key appears exactly once; an earlier duplicate 'muziek' entry
# mapped to the same value and has been dropped.)
cattrans = {
    'amusement': 'Talk',
    'animatie': 'Animated',
    'comedy': 'Comedy',
    'documentaire': 'Documentary',
    'educatief': 'Educational',
    'erotiek': 'Adult',
    'film': 'Movies',
    'informatief': 'Educational',
    'jeugd': 'Kids',
    'kunst/cultuur': 'arts/culture',
    'misdaad': 'Crime/Mystery',
    'muziek': 'Music',
    'natuur': 'Nature',
    'nieuws/actualiteiten': 'News',
    'overige': 'Unknown',
    'religieus': 'Religion',
    'serie/soap': 'SERIE',
    'sport': 'Sports',
    'theater': 'music/ballet/dance',
    'wetenschap': 'Science/Nature',
}

# Create a role translation dictionary for the xmltv credits part
# The keys are the roles used by tvgids.nl (lowercase please)
roletrans = {
    'regie': 'director',
    'acteurs': 'actor',
    'presentatie': 'presenter',
    'scenario': 'writer',
}

# We have two sources of logos, the first provides the nice ones, but is not
# complete. We use the tvgids logos to fill the missing bits.
logo_provider = [
    'http://visualisation.tudelft.nl/~paul/logos/gif/64x64/',
    'http://static.tvgids.nl/gfx/zenders/',
]

# Keyed by channel number; each value is a [n, name] pair.
# NOTE(review): n is presumably the index into logo_provider and name the
# logo's base file name -- the code that consumes this table is not visible
# here, so confirm before relying on it.
logo_names = {
    1: [0, 'ned1'],
    2: [0, 'ned2'],
    3: [0, 'ned3'],
    4: [0, 'rtl4'],
    5: [0, 'een'],
    6: [0, 'canvas_color'],
    7: [0, 'bbc1'],
    8: [0, 'bbc2'],
    9: [0, 'ard'],
    10: [0, 'zdf'],
    11: [1, 'rtl'],
    12: [0, 'wdr'],
    13: [1, 'ndr'],
    14: [1, 'srsudwest'],
    15: [1, 'rtbf1'],
    16: [1, 'rtbf2'],
    17: [0, 'tv5'],
    18: [0, 'ngc'],
    19: [1, 'eurosport'],
    20: [1, 'tcm'],
    24: [0, 'canal+red'],
    25: [0, 'mtv-color'],
    26: [0, 'cnn'],
    27: [0, 'rai'],
    28: [1, 'sat1'],
    29: [0, 'discover-spacey'],
    31: [0, 'rtl5'],
    32: [1, 'trt'],
    34: [0, 'veronica'],
    35: [0, 'tmf'],
    36: [0, 'sbs6'],
    37: [0, 'net5'],
    38: [1, 'arte'],
    39: [0, 'canal+blue'],
    40: [0, 'at5'],
    46: [0, 'rtl7'],
    49: [1, 'vtm'],
    50: [1, '3sat'],
    58: [1, 'pro7'],
    59: [1, 'kanaal2'],
    60: [1, 'vt4'],
    65: [0, 'animal-planet'],
    73: [1, 'mezzo'],
    86: [0, 'bbc-world'],
    87: [1, 'tve'],
    89: [1, 'nick'],
    90: [1, 'bvn'],
    92: [0, 'talpa-mono'],
    100: [1, 'tvutrecht'],
    101: [1, 'tvwest'],
    102: [1, 'tvrijnmond'],
    103: [1, 'tvnoordholland'],
    105: [1, 'spice'],
    107: [0, 'canal+yellow'],
}


# Work in progress, the idea is to cache program categories and
# descriptions to eliminate a lot of page fetches from tvgids.nl
# for programs that do not have interesting/changing descriptions
class ProgramCache:
    """
    A cache to hold program name and category info.

    TVgids stores the detail for each program on a separate URL with an
    (apparently unique) ID. This cache stores the fetched info with the ID.
    New fetches will use the cached info instead of doing an (expensive)
    page fetch.

    The cache is a plain dict (self.pdict) mapping a program's 'ID' to the
    program dict itself; it is persisted with pickle.
    """

    def __init__(self, filename=None):
        """
        Create a new ProgramCache object, optionally from file.

        filename -- path of a pickled cache. When it is None, or does not
                    name an existing file, the cache starts out empty.
        """
        # where we store our info
        self.filename = filename
        self.pdict = {}
        if filename is not None and os.path.isfile(filename):
            self.load(filename)

    def load(self, filename):
        """
        Loads a pickled cache dict from file, replacing the current contents.

        Errors from open() or pickle.load() propagate to the caller.
        """
        # NOTE: unpickling executes whatever the file describes; only point
        # this at cache files written by this script.
        # Binary mode is what pickle expects and is byte-identical to the
        # old text mode on Unix.
        fp = open(filename, 'rb')
        try:
            self.pdict = pickle.load(fp)
        finally:
            fp.close()

    def dump(self, filename):
        """
        Dumps a pickled cache to filename (overwriting it).
        """
        fp = open(filename, 'wb')
        try:
            pickle.dump(self.pdict, fp)
        finally:
            # close explicitly so the data is flushed to disk right away
            fp.close()

    def query(self, program_id):
        """
        Returns the cached program dict for program_id, or None when the
        ID is not in the cache.
        """
        return self.pdict.get(program_id)

    def add(self, program):
        """
        Adds a program (a dict that must contain an 'ID' key); an existing
        entry with the same ID is overwritten.
        """
        self.pdict[program['ID']] = program

    def clear(self):
        """
        Clears the cache (i.e. empties it)
        """
        self.pdict = {}

    def clean(self):
        """
        Removes all cached programming before today.
        Also removes erroneously cached programming.

        Every cached program must have a 'stop' value starting with
        YYYYMMDD and a 'name' value; a missing key raises KeyError.
        """
        now = int(time.strftime('%Y%m%d'))
        # iterate over a snapshot of the keys: entries are deleted on the fly
        for key in list(self.pdict):
            program = self.pdict[key]
            if int(program['stop'][0:8]) < now:
                del self.pdict[key]
            # normally, these entries should not be present in the
            # cache, but people may have caches filled with these
            # entries before they upgraded to a newer version of this
            # script
            elif program['name'].lower() == 'onbekend':
                del self.pdict[key]


def usage():
    """
    Prints the command-line help text to standard output.
    """
    # Single-argument print(...) produces exactly the same output as the
    # print statement on Python 2.
    lines = (
        'tv_grab_nl_py: A grabber that grabs tvguide data from tvgids.nl\n',
        'and stores it in XMLTV-combatible format.\n',
        'Usage:',
        '--help, -h = print this info',
        '--configure = create configfile (overwrites existing file)',
        '--config-file = name of the configuration file (default = ~/.xmltv/tv_grab_py.conf',
        '--output = file where to put the output',
        '--days = # number of days to grab',
        '--offset = # day offset from where to grab (0 is today, 1 tomorow, etc)',
        '--slow = also grab descriptions of programming',
        '--quiet = suppress all output',
        '----',
        '--compat = append tvgids.nl to the xmltv id (use this if you were using tv_grab_nl)',
        '--logos = insert urls to channel icons (mythfilldatabase will then use these)',
        '--nocattrans = do not translate the grabbed genres into MythTV-genres',
        '--cache = cache descriptions and use the file to store',
        '--clean_cache = clean the cache file before fetching',
        '--clear_cache = empties the cache file before fetching data',
        '--slowdays = grab slowdays initial days and the rest in fast mode',
    )
    for line in lines:
        print(line)


# Patterns used by filter_line, compiled once.
_ENTITY_RE = re.compile(r"&(\S+?);")
_TAG_RE = re.compile(r'(<.*?>)')


def filter_line_identity(m, defs=htmlentitydefs.entitydefs):
    """
    re.sub callback: translate one entity to its ISO Latin value.

    m    -- match object whose group(1) is the entity name without the
            surrounding '&' and ';'
    defs -- mapping of entity names to replacement text

    Numeric references (decimal '#233' or hexadecimal '#xE9') are turned
    into the corresponding character. Anything that cannot be translated
    (unknown name, malformed or out-of-range number) is returned unchanged.
    """
    k = m.group(1)
    if k.startswith("#"):
        try:
            if k[1:2] in ('x', 'X'):
                return chr(int(k[2:], 16))
            return chr(int(k[1:]))
        except (ValueError, OverflowError):
            # not a number, or outside what chr() can represent
            return m.group(0)  # use as is

    try:
        return defs[k]
    except KeyError:
        return m.group(0)  # use as is


def filter_line(s):
    """
    Removes unwanted stuff in strings (adapted from tv_grab_be)

    In order: entities are decoded, carriage returns become spaces,
    anything that looks like a tag (<...>) is dropped, two placeholder
    sequences are mapped to an apostrophe and, finally, every '&' is
    escaped as '&amp;'. Returns the cleaned string.
    """
    # do the latin1 stuff
    s = _ENTITY_RE.sub(filter_line_identity, s)

    # NOTE(review): as written this replaces a space with a space; the
    # first argument was probably a non-breaking space or '&nbsp;' -- confirm.
    s = s.replace(' ', ' ')
    s = s.replace('\r', ' ')
    s = _TAG_RE.sub('', s)

    # A couple of characters which are not legal in Latin-1, we have
    # to guess what they are.
    s = s.replace('~Q', "'")
    s = s.replace('~R', "'")

    # A bare '&' is not legal in XML: without this escape,
    # mythfilldatabase barfs on program names like "Steinbrecher &..."
    s = s.replace('&', '&amp;')

    return s


def calc_timezone(t):
    """
    Takes a time from tvgids.nl and formats it with all the required
    timezone conversions.
    in: '20050429075000'
    out:'20050429075000 (CET|CEST)'

    Until I have figured out how to correctly do timezoning in python this
    method will bork if you are not in a zone that has the same DST rules
    as 'Europe/Amsterdam'.

    When the zone name cannot be determined a message is written to stderr
    and the time is returned followed by a single space.
    """
    year = int(t[0:4])
    month = int(t[4:6])
    day = int(t[6:8])
    hour = int(t[8:10])
    minute = int(t[10:12])

    # isdst=-1 lets the C library decide whether DST applies at that moment
    pt = time.mktime((year, month, day, hour, minute, 0, 0, 0, -1))
    timezone = ''
    try:
        # the last field of the local time tuple is the DST flag, which
        # selects the standard or the DST name from time.tzname
        timezone = time.tzname[(time.localtime(pt))[-1]]
    except (IndexError, ValueError, OverflowError, OSError):
        sys.stderr.write('Cannot convert time to timezone\n')

    return t + ' %s' % timezone


def duration(t1, t2):
    """
    Calculates the duration of a program (24h times) in minutes.
    t2 can be on the next day.

    t1, t2 -- start and stop time as 'HH:MM' strings

    duration('23:10', '23:15') -> 5
    duration('23:10', '00:20') -> 70
    """
    h1 = int(t1[0:2])
    m1 = int(t1[3:5])
    h2 = int(t2[0:2])
    m2 = int(t2[3:5])

    # NOTE(review): everything after "if h2" was missing from the source
    # (text between '<' and the next '>' had been dropped); the rest of this
    # function is reconstructed from the docstring examples -- verify against
    # the upstream script.
    minutes = (h2 - h1) * 60 + (m2 - m1)
    if h2 < h1:
        # the stop time lies on the next day
        minutes += 24 * 60
    return minutes


# NOTE(review): the "def" line and the start of the docstring of this
# function were missing from the source; the name and signature below are
# inferred from the surviving body (it reads t1 and minutes) and must be
# checked against the callers / the upstream script.
def addtime(t1, minutes):
    """
    Adds a number of minutes to a 24h 'HH:MM' time, wrapping at midnight.

    addtime('11:00', 30) -> '11:30'
    """
    h = int(t1[0:2])
    m = int(t1[3:5]) + minutes
    nh, nm = divmod(h * 60 + m, 60)
    return '%02d:%02d' % (nh % 24, nm)


def get_page_internal(url, quiet=0):
    """
    Retrieves the url and returns a string with the contents.

    Returns None when the page cannot be fetched; unless quiet is set a
    message is written to stderr in that case. This function itself does
    not enforce a timeout.
    """
    try:
        fp = urllib.urlopen(url)
        try:
            return fp.read()
        finally:
            # always release the connection, also when reading fails
            fp.close()
    except Exception:
        if not quiet:
            sys.stderr.write('Cannot open url: %s\n' % url)
        return None


class FetchURL(Thread):
    """
    A simple thread to fetch a url with a timeout

    After the thread has run, self.result holds the page contents, or None
    when the fetch failed (or has not finished yet).
    """

    def __init__(self, url, quiet=0):
        Thread.__init__(self)
        self.quiet = quiet
        self.url = url
        self.result = None

    def run(self):
        self.result = get_page_internal(self.url, self.quiet)


def get_page(url, quiet=0):
    """
    Wrapper around get_page_internal that gives up after global_timeout
    seconds.

    Returns the page contents, or None when the fetch failed or did not
    finish in time. Unless quiet is set, a timeout is reported on stderr.
    """
    try:
        fu = FetchURL(url, quiet)
        fu.start()
        fu.join(global_timeout)
        # join() returns silently when the timeout expires, so ask the
        # thread itself (is_alive on newer Pythons, isAlive on older ones)
        still_running = getattr(fu, 'is_alive', None) or fu.isAlive
        if not still_running():
            return fu.result
    except Exception:
        # fall through and report, as before
        pass

    if not quiet:
        sys.stderr.write('get_page timed out on (>%s s): %s\n' %
                         (global_timeout, url))
    return None


def get_channels(file, quiet=0):
    """
    Get a list of all available channels and store these in a file.
    """
    # NOTE(review): this function continues beyond the part of the file that
    # was available for review; the statements below are unchanged.

    # store channels in a dict
    channels = {}

    # tvgids stores several instances of channels, we want to
    # find all the possibile channels
    # NOTE(review): both patterns below look damaged (the HTML tags they
    # should match appear to have been lost) -- restore from upstream.
    channel_get = re.compile('(.*?)', re.DOTALL)

    # this is how we will find a (number, channel) instance
    channel_re = re.compile('', re.DOTALL)

    # this is where we will try to find our channel list
    total = get_page(uitgebreid_zoeken, quiet)
    if total == None:
        return

    # get a list of match objects of all the