# Program to print all web links on a web page
# It assumes the resource that the URL points to returns
# UTF-8 encoded characters
#
# Matt Bishop, MHI 289I, Fall 2024
#
import urllib.request
import re

# set to True to dump the page source and each raw match while scanning
debug = False

# Matches an HTML anchor tag and captures the value of its href attribute
# in group 1.
# NOTE(review): the pattern in the original source was an empty string
# (apparently stripped when the file was extracted), which made
# group(1) raise IndexError.  This pattern is a reconstruction -- confirm
# it against the intended one.
#   <a\s+              the tag name followed by whitespace (so "<abbr" is
#                      not mistaken for an anchor)
#   (?:[^>]*?\s)?      any earlier attributes inside the same tag
#   href\s*=\s*        the attribute name and the equals sign
#   ["']([^"']*)["']   the quoted URL, captured as group 1
LINK_RE = re.compile(
    r'''<a\s+(?:[^>]*?\s)?href\s*=\s*["']([^"']*)["']''',
    re.IGNORECASE,
)


def link_matches(html):
    """Return an iterator over the anchor-tag matches found in *html*.

    Each item is a match object whose group(1) is the href value of one
    anchor tag, in the order the tags appear in the text.  The text is
    scanned once from left to right; an empty string or a page with no
    anchors produces an empty iterator.
    """
    return LINK_RE.finditer(html)


def main():
    """Prompt for a URL, fetch the page, and print every link on it.

    All failures (end of input, unreachable URL, page that is not UTF-8)
    are reported with a printed message rather than a traceback.
    """
    #
    # ask for the URL
    #
    try:
        urlname = input("Please enter the URL: ")
    except (EOFError, KeyboardInterrupt):
        # the user closed the input or hit the interrupt key -- just leave
        print("Bye!")
        return
    except Exception:
        print("That's not a valid string. ")
        return

    #
    # got it -- now try to read from the URL
    # the "with" closes the connection even if the read fails
    #
    try:
        with urllib.request.urlopen(urlname) as webpage:
            raw = webpage.read()
    except Exception as msg:
        print("URL retrieval failed!", msg)
        return

    # store the page's contents in 's'; decode() defaults to UTF-8,
    # which is what this program assumes the page is encoded in
    try:
        s = raw.decode()
    except UnicodeDecodeError as msg:
        print("Web page is not UTF-8 encoded!", msg)
        return

    ### debug: print out the web page
    if debug:
        print('Web page source code:[')
        print(s)
        print('----------')

    #
    # look for URLs on the web page
    # finditer resumes each search just beyond the end of the previous
    # match and stops at the end of the string, so there is no need to
    # track an index or re-slice the string by hand
    #
    for sp in link_matches(s):
        # found one!
        if debug:
            print(sp)
        # print it
        print(sp.group(1))
        if debug:
            # offset (within the whole page) just beyond this match
            print(sp.end())


if __name__ == "__main__":
    main()