# Program to print all web links on a web page
# It assumes the resource that the URL points to returns
# UTF-8 encoded characters
#
# Matt Bishop, MHI 289I, Fall 2024
#
import urllib.request
import re

# Set to True to also dump the fetched page and the raw match objects.
debug = False

# Anchor tags of the exact form <a href="...">; the captured group is the
# link target.  Anchors that carry other attributes are not matched.
LINK_PATTERN = re.compile(r'<a href="([^"]*)">')


def read_page(url):
    """Fetch *url* and return its body as text.

    The body is decoded as UTF-8.  Bytes that are not valid UTF-8 are
    replaced with U+FFFD rather than raising, so a single stray byte does
    not prevent the links on the rest of the page from being listed.

    Any exception raised by urllib while opening or reading the URL
    propagates to the caller.
    """
    # The 'with' block closes the connection even if read() fails.
    with urllib.request.urlopen(url) as webpage:
        return webpage.read().decode("utf-8", errors="replace")


def find_link_matches(page):
    """Return an iterator over the LINK_PATTERN matches in *page*.

    Matches are produced left to right and do not overlap; group(1) of
    each match is the link target.  The page is scanned once, in place,
    so no substring copies are made.
    """
    return LINK_PATTERN.finditer(page)


def main():
    """Prompt for a URL, fetch the page, and print each link found on it."""
    #
    # ask for the URL
    #
    try:
        urlname = input("Please enter the URL: ")
    except (EOFError, KeyboardInterrupt):
        # end of input or Ctrl-C at the prompt: the user wants out
        print("Bye!")
        return
    except Exception:
        print("That's not a valid string. ")
        return
    #
    # got it -- now try to read from the URL
    #
    try:
        s = read_page(urlname)
    except Exception as msg:
        # covers a malformed URL, a failed connection, and a failed read
        print("URL retrieval failed!", msg)
        return
    ### debug: print out the web page
    if debug:
        print('Web page source code:[')
        print(s)
        print('----------')
    #
    # look for URLs on the web page -- get each one and print it
    #
    for sp in find_link_matches(s):
        if debug:
            print(sp)
        print(sp.group(1))
        if debug:
            # offset, within the whole page, just beyond this match
            print(sp.end())


if __name__ == "__main__":
    main()