PYTHON PROGRAM RELATED TO INFORMATION RETRIEVAL AND WEB SEARCH
@author: nithin rao """ # importing libraries import requests from bs4 import BeautifulSoup def code(rows): rem = ["[", "]", "(", ")", "\xa0", "!", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"] data = [] url = [] for count,x in enumerate(rows): if "http" in x or "https" in x: url.append(x) if len(x) > 1: for i in rem: if i in x: rows[count] = rows[count].replace(i, "") data.append(rows[count]) cat1 = [".", ":", ","] for count,x in enumerate(data): if len(x) > 1: if "-" in x: data[count] = x.split("-")[0] data.append(x.split("-")[-1]) for i in cat1: if i == x[-1]: data[count] = data[count].replace(i, "") cat2 = [".", ":", ",", "/"] final_data = [] for count,x in enumerate(data): if len(x) > 1: for i in cat2: if i in x and x[:4] != "http": data[count] = data[count].replace(i, "") final_data.append(data[count]) return final_data urls = ["https://www.cs.memphis.edu/~vrus/teaching/ir-websearch/papers/codingStyle.html", "https://cs.memphis.edu/~vrus/teaching/ir-websearch/"] totalData = [] # getting source data for url in urls: r = requests.get(url) soup = BeautifulSoup(r.content, 'html5lib') columns = [] for j,s in enumerate(soup.findAll('body')): columns.append(''.join(s.findAll(text=True))) # removing new lines newLinesRemoved = columns[0].replace("\n", " ").replace("\t", " ") # removing comments in html rows = newLinesRemoved.replace("START LOADING THE PAGE", "") rows = rows.split(" ") final_data = code(rows) totalData.append(final_data) f = {} for data in totalData: for x in data: if len(x): word = x.lower() if word in f.keys(): f[word] +=1 else: f[word] = 1 print(f)