This is a script that can be used to download all media files from a website.
If you give the script a URL, it will retrieve the page and scan it for links. You can pass a regular expression to define which links should be found. The default expression is [^"]+\.(mp4|webm|avi|mp3|aac|wav|flac|png|jpg|gif), but it can be changed with one of the command line options to search for different file types or, for instance, to only find links to a specific site.
There are options to define whether you only want to print the links or also download them.
Other options allow you to pass a cookie along with the request, which may be useful if you need to log in to a website; you can copy the cookie from your browser. Another option lets you change the user-agent string, so the website you are looking up won't know you are using a script instead of a browser.
#!/usr/bin/env python3
#
# Copyright 2015 Hans Alves <halves@localhost>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
#
"""Scan web pages for <a href="..."> links that match a pattern and
optionally download the linked files."""

import argparse
import os.path
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# Template for the link-matching regex; {exp} is filled in with the
# user-supplied (or default) target expression from the command line.
searchexp = r'(?:<a\s[^>]*href="(?P<ahref>(?:https?://)?{exp})")'
# Normalized netloc/path of every link seen so far, for deduplication.
found = set()
# Persisted answer for filename collisions: '' (ask), 'o', 'i', 'e',
# or 'n' (don't persist, but stop offering to persist).
action = ''


def check(url):
    """Return True the first time a URL is seen, False on repeats.

    URLs are normalized to "netloc/path" (scheme, query and fragment are
    ignored), so the same resource reached via http and https counts as
    a single link.
    """
    parts = urllib.parse.urlparse(url)
    s = os.path.join(parts.netloc.strip('/'), parts.path.strip('/'))
    if s in found:
        return False
    found.add(s)
    return True


def ask_name(name):
    """Ask the user what to do when the file `name` already exists.

    Returns the filename to write to, or None when the download should
    be skipped.  The choice can be remembered in the module-level
    `action` so later collisions are handled without prompting.
    """
    global action
    print("""A file with the name `{}' already exists.
What do you want to do? These are your options:
(o)verwrite the file, (i)gnore the file, (e)nter a new filename
If none of these options is given the file will be overwritten
? """.format(name), end='', file=sys.stderr)
    if action != '' and action in 'oie':
        # A persisted choice exists; reuse it without prompting.
        answer = action
        print(action, file=sys.stderr)
    else:
        answer = input().strip().lower()
        while answer and answer not in 'oie':
            print('invalid option', file=sys.stderr)
            answer = input().strip().lower()
        if not answer:
            # Empty input means overwrite, as the prompt promises.
            answer = 'o'
        if action == '':
            print('Do this every time from now on? \n(y/N) ',
                  end='', file=sys.stderr)
            answer2 = input().strip().lower()
            while answer2 and answer2 not in 'yn':
                print('invalid option', file=sys.stderr)
                answer2 = input().strip().lower()
            # 'y' persists the current answer; 'n' suppresses this
            # question on later collisions; '' keeps asking every time.
            action = answer if answer2 == 'y' else answer2
    if answer == 'e':
        print('Please enter a new filename: ', file=sys.stderr)
        return input().strip()
    elif answer == 'o':
        return name
    else:
        return None


def download(url, headers, args):
    """Download `url` into the current directory.

    headers -- dict of HTTP headers (User-Agent/Cookie) to send.
    args    -- parsed command line options (used for verbosity).

    Prompts via ask_name() when the target filename already exists.  A
    failed download is reported on stderr instead of aborting the run.
    """
    if args.verbose:
        print('found link', url)
    # Derive the local filename from the URL's path component only, so
    # a query string or fragment never ends up in the filename.
    name = os.path.basename(urllib.parse.urlparse(url).path)
    if not name:
        # URL path ends in '/'; fall back to a generic name.
        name = 'index'
    if os.path.exists(name):
        name = ask_name(name)
        if name is None:
            return
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req) as f, open(name, 'wb') as out:
            if args.verbose:
                print('downloading', name)
            out.write(f.read())
    except urllib.error.URLError as err:
        # One bad link should not abort the whole batch.
        print('failed to download', url, '-', err, file=sys.stderr)


def getlink(url, regex, args):
    """Fetch `url`, scan it for links matching `regex` and print them.

    Links are resolved against the page URL (relative, protocol-relative
    and absolute forms all work) and deduplicated via check().  When
    args.get is set each new link is also downloaded.
    """
    if args.verbose:
        print('getting url', url)
    origparts = urllib.parse.urlparse(url)
    if not origparts.scheme or not origparts.netloc:
        print('Invalid url', url, file=sys.stderr)
        return
    headers = {}
    if args.user_agent:
        if args.verbose:
            print('setting User-Agent header to', args.user_agent)
        headers['User-Agent'] = args.user_agent
    if args.cookie:
        if args.verbose:
            print('adding cookie', args.cookie)
        headers['Cookie'] = args.cookie
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as f:
        if args.verbose:
            print('reading data')
        data = f.read().decode()
    if args.verbose:
        print('searching for links')
    for match in regex.finditer(data):
        # urljoin resolves relative links against the page URL (not the
        # site root) and handles protocol-relative links correctly.
        link = urllib.parse.urljoin(url, match.group('ahref'))
        if check(link):
            print(link)
            if args.get:
                download(link, headers, args)


def main():
    """Parse the command line and process every requested page."""
    parser = argparse.ArgumentParser(description='getlink.py')
    expgroup = parser.add_mutually_exclusive_group(required=False)
    expgroup.add_argument('-a', '--all', action='store_true',
                          help='Get all links')
    expgroup.add_argument('-t', '--filetype', metavar='FILETYPE',
                          help='Get links of filetype')
    expgroup.add_argument('-e', '--regex', metavar='EXPRESSION',
                          help='Get links that match regex')
    actgroup = parser.add_mutually_exclusive_group(required=False)
    actgroup.add_argument('-p', '--print', dest='get', action='store_false',
                          help='Print links')
    actgroup.add_argument('-g', '--get', action='store_true',
                          help='Download links (default)', default=True)
    parser.add_argument('-c', '--cookie', metavar='COOKIE',
                        help='add cookie to the request')
    parser.add_argument('-u', '--user-agent', metavar='AGENT',
                        help='Spoof the user agent')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='be verbose')
    parser.add_argument('-f', '--file', help='read urls to get from file')
    parser.add_argument('-w', '--wait', metavar='SECONDS',
                        help='wait between the downloading of links, servers '
                             'might like this',
                        type=float, default=1)
    parser.add_argument('link', nargs='*',
                        help='url of the page to search for links')
    args = parser.parse_args()

    # Build the link-matching expression: default is common media file
    # extensions; -a matches everything, -t one filetype, -e a custom
    # regular expression.
    exp = r'[^"]+\.(mp4|webm|avi|mp3|aac|wav|flac|png|jpg|gif)'
    if args.all:
        exp = r'[^"]+'
    elif args.filetype is not None:
        exp = r'[^"]+\.{}'.format(args.filetype)
    elif args.regex:
        exp = args.regex
    if args.verbose:
        print('using expression', searchexp.format(exp=exp))
    regex = re.compile(searchexp.format(exp=exp))

    if not args.file and not args.link:
        parser.error("Need an url to get")
    if args.file:
        if args.verbose:
            print('reading urls from', args.file)
        with open(args.file, 'r') as f:
            for url in f:
                url = url.strip()
                getlink(url, regex, args)
                if args.wait:
                    time.sleep(args.wait)
    for url in args.link:
        getlink(url, regex, args)
        if args.wait:
            time.sleep(args.wait)


if __name__ == '__main__':
    main()