This is a script that can be used to download all media files from a website.
If you give the script a URL, it will retrieve the page and scan it for links. You can pass a regular expression to define which links should be found. The default expression is [^"]+\.(mp4|webm|avi|mp3|aac|wav|flac|png|jpg|gif), but it can be changed with one of the command line options to search for different file types or, for instance, to only find links to a specific site.
There are options to define whether you only want to print the links or also download them.
Other options allow you to pass a cookie along with the request, which may be useful if you need to log in to a website; you can copy the cookie from your browser. Another option lets you change the user-agent string, so the website you are looking up won't know you are using a script instead of a browser.
#!/usr/bin/env python3
#
# Copyright 2015 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
"""Scan web pages for <a href="..."> links that match a pattern and
optionally download the linked files."""

import argparse
import os.path
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# Template for the link-matching regex; {exp} is filled in with the
# user-supplied (or default) target expression from the command line.
searchexp = r'(?:<a\s[^>]*href="(?P<ahref>(?:https?://)?{exp})")'
# Normalized netloc/path of every link seen so far, for deduplication.
found = set()
# Persisted answer for filename collisions: '' (ask), 'o', 'i', 'e',
# or 'n' (don't persist, but stop offering to persist).
action = ''


def check(url):
    """Return True the first time a URL is seen, False on repeats.

    URLs are normalized to "netloc/path" (scheme, query and fragment are
    ignored), so the same resource reached via http and https counts as
    a single link.
    """
    parts = urllib.parse.urlparse(url)
    s = os.path.join(parts.netloc.strip('/'), parts.path.strip('/'))
    if s in found:
        return False
    found.add(s)
    return True


def ask_name(name):
    """Ask the user what to do when the file `name` already exists.

    Returns the filename to write to, or None when the download should
    be skipped.  The choice can be remembered in the module-level
    `action` so later collisions are handled without prompting.
    """
    global action
    print("""A file with the name `{}' already exists.
What do you want to do? These are your options:
(o)verwrite the file, (i)gnore the file, (e)nter a new filename
If none of these options is given the file will be overwritten
? """.format(name), end='', file=sys.stderr)
    if action != '' and action in 'oie':
        # A persisted choice exists; reuse it without prompting.
        answer = action
        print(action, file=sys.stderr)
    else:
        answer = input().strip().lower()
        while answer and answer not in 'oie':
            print('invalid option', file=sys.stderr)
            answer = input().strip().lower()
        if not answer:
            # Empty input means overwrite, as the prompt promises.
            answer = 'o'
        if action == '':
            print('Do this every time from now on? \n(y/N) ',
                  end='', file=sys.stderr)
            answer2 = input().strip().lower()
            while answer2 and answer2 not in 'yn':
                print('invalid option', file=sys.stderr)
                answer2 = input().strip().lower()
            # 'y' persists the current answer; 'n' suppresses this
            # question on later collisions; '' keeps asking every time.
            action = answer if answer2 == 'y' else answer2
    if answer == 'e':
        print('Please enter a new filename: ', file=sys.stderr)
        return input().strip()
    elif answer == 'o':
        return name
    else:
        return None


def download(url, headers, args):
    """Download `url` into the current directory.

    headers -- dict of HTTP headers (User-Agent/Cookie) to send.
    args    -- parsed command line options (used for verbosity).

    Prompts via ask_name() when the target filename already exists.  A
    failed download is reported on stderr instead of aborting the run.
    """
    if args.verbose:
        print('found link', url)
    # Derive the local filename from the URL's path component only, so
    # a query string or fragment never ends up in the filename.
    name = os.path.basename(urllib.parse.urlparse(url).path)
    if not name:
        # URL path ends in '/'; fall back to a generic name.
        name = 'index'
    if os.path.exists(name):
        name = ask_name(name)
        if name is None:
            return
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req) as f, open(name, 'wb') as out:
            if args.verbose:
                print('downloading', name)
            out.write(f.read())
    except urllib.error.URLError as err:
        # One bad link should not abort the whole batch.
        print('failed to download', url, '-', err, file=sys.stderr)


def getlink(url, regex, args):
    """Fetch `url`, scan it for links matching `regex` and print them.

    Links are resolved against the page URL (relative, protocol-relative
    and absolute forms all work) and deduplicated via check().  When
    args.get is set each new link is also downloaded.
    """
    if args.verbose:
        print('getting url', url)
    origparts = urllib.parse.urlparse(url)
    if not origparts.scheme or not origparts.netloc:
        print('Invalid url', url, file=sys.stderr)
        return
    headers = {}
    if args.user_agent:
        if args.verbose:
            print('setting User-Agent header to', args.user_agent)
        headers['User-Agent'] = args.user_agent
    if args.cookie:
        if args.verbose:
            print('adding cookie', args.cookie)
        headers['Cookie'] = args.cookie
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as f:
        if args.verbose:
            print('reading data')
        data = f.read().decode()
    if args.verbose:
        print('searching for links')
    for match in regex.finditer(data):
        # urljoin resolves relative links against the page URL (not the
        # site root) and handles protocol-relative links correctly.
        link = urllib.parse.urljoin(url, match.group('ahref'))
        if check(link):
            print(link)
            if args.get:
                download(link, headers, args)


def main():
    """Parse the command line and process every requested page."""
    parser = argparse.ArgumentParser(description='getlink.py')
    expgroup = parser.add_mutually_exclusive_group(required=False)
    expgroup.add_argument('-a', '--all', action='store_true',
                          help='Get all links')
    expgroup.add_argument('-t', '--filetype', metavar='FILETYPE',
                          help='Get links of filetype')
    expgroup.add_argument('-e', '--regex', metavar='EXPRESSION',
                          help='Get links that match regex')
    actgroup = parser.add_mutually_exclusive_group(required=False)
    actgroup.add_argument('-p', '--print', dest='get', action='store_false',
                          help='Print links')
    actgroup.add_argument('-g', '--get', action='store_true',
                          help='Download links (default)', default=True)
    parser.add_argument('-c', '--cookie', metavar='COOKIE',
                        help='add cookie to the request')
    parser.add_argument('-u', '--user-agent', metavar='AGENT',
                        help='Spoof the user agent')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='be verbose')
    parser.add_argument('-f', '--file', help='read urls to get from file')
    parser.add_argument('-w', '--wait', metavar='SECONDS',
                        help='wait between the downloading of links, servers '
                             'might like this',
                        type=float, default=1)
    parser.add_argument('link', nargs='*',
                        help='url of the page to search for links')
    args = parser.parse_args()

    # Build the link-matching expression: default is common media file
    # extensions; -a matches everything, -t one filetype, -e a custom
    # regular expression.
    exp = r'[^"]+\.(mp4|webm|avi|mp3|aac|wav|flac|png|jpg|gif)'
    if args.all:
        exp = r'[^"]+'
    elif args.filetype is not None:
        exp = r'[^"]+\.{}'.format(args.filetype)
    elif args.regex:
        exp = args.regex
    if args.verbose:
        print('using expression', searchexp.format(exp=exp))
    regex = re.compile(searchexp.format(exp=exp))

    if not args.file and not args.link:
        parser.error("Need an url to get")
    if args.file:
        if args.verbose:
            print('reading urls from', args.file)
        with open(args.file, 'r') as f:
            for url in f:
                url = url.strip()
                getlink(url, regex, args)
                if args.wait:
                    time.sleep(args.wait)
    for url in args.link:
        getlink(url, regex, args)
        if args.wait:
            time.sleep(args.wait)


if __name__ == '__main__':
    main()