Write a python code

profileharipriyastudyx
Module3codes.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Python Basics (Instructor: Dr. Milad Baghersad)\n", "## Module 3: Web Scraping with Python Part 1\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___\n", "### Importing a plain text file (locally)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filename = \"Tesla.txt\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file = open(filename, mode= \"r\") #read mode" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = file.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(filename, mode= \"r\") as file:\n", " text = file.read()\n", " print(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(filename, mode= \"w\") as file: #we can write\n", " file.write(text)\n", " #file.wrte(text)\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(filename, mode= \"a\") as file: #we can append\n", " file.write(\"Apple\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filepath = \"C:\\\\Users\\\\Milad\\\\Desktop\\\\ISM 6405\\\\Module 3\\\\Tesla.txt\" #Windows\n", "#filepath = \"C:/Users/Milad/Desktop/ISM 6405/Module 3/Tesla.txt\" #mac" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(filepath, mode= \"r\") as file:\n", " text = file.read()\n", " print(text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "### Use pandas to import csv files (we learn more about pandas later):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dealership_data = pd.read_csv(\"dealership.csv\", delimiter=\",\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(dealership_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dealership_data.at[0,\"Profit\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___\n", "### Importing a plain text file from web:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example:\n", "U.S. Securities and Exchange Commission (SEC):\n", "\n", "https://www.sec.gov/Archives/edgar/full-index/\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#read the master file, open reports mentioned in the master file and check if a word exists in the report\n", "filepath = \"C:\\\\Users\\\\Milad\\\\Desktop\\\\ISM 6405\\\\Module 3\\\\master-2018Q1 - short version.idx\"\n", "\n", "with open(filepath,'r') as mastertext:\n", " content=mastertext.readlines()\n", " \n", " for row in content:\n", " print (row)\n", " \n", " if len(row)!= 0:\n", " row = row.strip('\\n')\n", " if str(row).endswith(\".txt\"):\n", " columns = row.split(\"|\")\n", " #print(columns)\n", " cik = columns[0]\n", " companyname = columns[1]\n", " formtype = columns[2]\n", " datefield = columns[3]\n", " filenames = columns[4]\n", " print(filenames)\n", " \n", " archivedUrl = \"https://www.sec.gov/Archives/\" + filenames\n", " print(archivedUrl)\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#read the master file, open reports mentioned in the master file and check if a word exists in the report\n", "filepath = \"C:\\\\Users\\\\Milad\\\\Desktop\\\\ISM 6405\\\\Module 3\\\\master-2018Q1 - short version.idx\"\n", "\n", "with open(filepath,'r') as mastertext:\n", " content=mastertext.readlines()\n", " \n", " for row in content:\n", " print (row)\n", " \n", " if len(row)!= 0:\n", " row = row.strip('\\n')\n", " if str(row).endswith(\".txt\"):\n", " columns = row.split(\"|\")\n", " #print(columns)\n", " cik = columns[0]\n", " companyname = columns[1]\n", " formtype = columns[2]\n", " datefield = columns[3]\n", " filenames = columns[4]\n", " \n", " archivedUrl = \"https://www.sec.gov/Archives/\" + filenames\n", " print(archivedUrl)\n", " \n", " import requests\n", " response = requests.get(archivedUrl)\n", " print(response.content[:400])\n", " report_text = response.text\n", " if \"merger\" in report_text.lower():\n", " print(\"YES!\")\n", " else:\n", " print(\"NO\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#save 10-Q reports\n", "filepath = \"C:\\\\Users\\\\Milad\\\\Desktop\\\\ISM 6405\\\\Module 3\\\\master-2018Q1 - short version.idx\"\n", "\n", "with open(filepath,'r') as mastertext:\n", " content=mastertext.readlines()\n", " \n", " for row in content:\n", " print (row)\n", " \n", " if len(row)!= 0:\n", " row = row.strip('\\n')\n", " if str(row).endswith(\".txt\"):\n", " columns = row.split(\"|\")\n", " #print(columns)\n", " cik = columns[0]\n", " companyname = columns[1]\n", " formtype = columns[2]\n", " datefield = columns[3]\n", " filenames = columns[4]\n", " \n", " archivedUrl = \"https://www.sec.gov/Archives/\" + filenames\n", " print(archivedUrl)\n", " \n", " response = requests.get(archivedUrl)\n", " response.encoding = 'utf-8'\n", " print(response.content[:400])\n", " report_text = response.text\n", " \n", " filename = filenames.split(\"/\")[-1]\n", " if formtype == \"10-Q\":\n", " with open(filename, 'w') as f:\n", " f.write(report_text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### save the code as a function:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def downloadreports(filepath):\n", " with open(filepath,'r') as mastertext:\n", " content=mastertext.readlines()\n", " \n", " for row in content:\n", " print (row)\n", " \n", " if len(row)!= 0:\n", " row = row.strip('\\n')\n", " if str(row).endswith(\".txt\"):\n", " columns = row.split(\"|\")\n", " #print(columns)\n", " cik = columns[0]\n", " companyname = columns[1]\n", " formtype = columns[2]\n", " datefield = columns[3]\n", " filenames = columns[4]\n", " \n", " archivedUrl = \"https://www.sec.gov/Archives/\" + filenames\n", " print(archivedUrl)\n", " \n", " r = requests.get(archivedUrl)\n", " r.encoding = 'utf-8'\n", " print(r.content[:400])\n", " report_text = r.text\n", " \n", " filename = filenames.split(\"/\")[-1]\n", " if formtype == \"10-Q\":\n", " with open(filename, 'w') as f:\n", " f.write(report_text) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "downloadreports(filepath = \"C:\\\\Users\\\\Milad\\\\Desktop\\\\ISM 6405\\\\Module 3\\\\master-2018Q1 - short version.idx\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use urllib library to save a file from web:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from urllib.request import urlretrieve" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "archivedUrl = \"https://www.sec.gov/Archives/edgar/data/1000045/0001193125-18-037381.txt\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "urlretrieve(archivedUrl, \"downloaded_urllib.txt\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___\n", "______\n", "___\n", "___\n", "___\n", "___\n", "___\n", "\n", "### Extract information from HTML:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Example: save data about a product recall in https://www.cpsc.gov\n", "url = \"https://www.cpsc.gov/Recalls?combine=sofa&field_rc_date%5Bdate%5D=&field_rc_date_1%5Bdate%5D=\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "response = requests.get(url)\n", "response.status_code" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if response.status_code == 200:\n", " print(\"Success\")\n", "else:\n", " print(\"Failure\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "sofa_recall_text = response.text\n", "print(sofa_recall_text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### use BeautifulSoup package to add readability plus having useful functions\n", "https://www.crummy.com/software/BeautifulSoup/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "soup = BeautifulSoup(sofa_recall_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(soup.prettify())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Functions in BeautifulSoup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(soup.title)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(soup.title.get_text()) #get_text gets texts inside a tag" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# find_all finds all instances of a tag\n", "links = soup.find_all(\"a\") #hyperlinks are defined tag <a> in HTML\n", "print(links)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for link in links: \n", " print(link.get(\"href\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# find() finds the first instance of a tag\n", "first_link = soup.find('a')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(first_link)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### use functions to collect data from CPSC" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recalled_sofas = soup.find_all(\"div\", class_=\"views-field views-field-php\")\n", "print(recalled_sofas)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "soup.find(\"div\", class_=\"views-field views-field-php\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall = soup.find(\"div\", class_=\"views-field views-field-php\")\n", "recall_link = recall.find(\"a\")\n", "link_url = recall_link.get('href')\n", "print(\"link url:\",link_url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall_title = recall.find(\"div\", class_=\"title\")\n", "print(recall_title)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall.find(\"div\", class_=\"title\").get_text()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "---\n", "---\n", "---\n", "\n", "### Write a function that returns recalls title, date, introduction, remedy, units, etc. when you enter a keyword" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_recalls_v1(keywords):\n", " \n", " import requests\n", " from bs4 import BeautifulSoup\n", " \n", " url = \"https://www.cpsc.gov/Recalls?combine=\" + keywords\n", " response = requests.get(url)\n", " if not response.status_code == 200:\n", " print(\"the link is broken\")\n", " return None\n", " \n", " recall_list = list()\n", " \n", " \n", " try:\n", " webpage_text = response.text\n", " soup = BeautifulSoup(webpage_text)\n", " recalls = soup.find_all(\"div\", class_=\"views-field views-field-php\")\n", " for recall in recalls:\n", " recall_title = recall.find(\"div\", class_=\"title\").get_text()\n", " \n", " recall_list.append((recall_title))\n", " return recall_list\n", " \n", " except:\n", " print(\"Error!!!\")\n", " return None\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recalls_v1(\"sofa\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_recalls_v2(keywords):\n", " \n", " import requests\n", " from bs4 import BeautifulSoup\n", " \n", " url = \"https://www.cpsc.gov/Recalls?combine=\" + keywords\n", " response = requests.get(url)\n", " if not response.status_code == 200:\n", " print(\"the link is broken\")\n", " return None\n", " \n", " recall_list = list()\n", " \n", " \n", " try:\n", " webpage_text = response.text\n", " soup = BeautifulSoup(webpage_text)\n", " recalls = soup.find_all(\"div\", class_=\"views-field views-field-php\")\n", " for recall in recalls:\n", " recall_title = recall.find(\"div\", class_=\"title\").get_text()\n", " recall_date = recall.find(\"div\", class_=\"date\").get_text()\n", " recall_introduction = recall.find(\"div\", class_=\"introduction\").get_text()\n", " recall_remedy = recall.find(\"div\", class_=\"remedy\").get_text()\n", " recall_units = recall.find(\"div\", class_=\"units\").get_text()\n", " recall_link = recall.find(\"a\").get(\"href\")\n", " \n", " recall_list.append((recall_title, recall_date, recall_introduction, recall_remedy, recall_units, recall_link))\n", " return recall_list\n", " \n", " except:\n", " print(\"Error!!!\")\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recalls_v2(\"sofa\") ### will give error" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_recalls_v3(keywords):\n", " \n", " import requests\n", " from bs4 import BeautifulSoup\n", " \n", " url = \"https://www.cpsc.gov/Recalls?combine=\" + keywords\n", " response = requests.get(url)\n", " if not response.status_code == 200:\n", " print(\"the link is broken\")\n", " return None\n", " \n", " recall_list = list()\n", " \n", " \n", " try:\n", " webpage_text = response.text\n", " soup = BeautifulSoup(webpage_text)\n", " recalls = soup.find_all(\"div\", class_=\"views-field views-field-php\")\n", " for recall in recalls:\n", " recall_title = recall.find(\"div\", class_=\"title\").get_text()\n", " recall_date = recall.find(\"div\", class_=\"date\").get_text()\n", " recall_introduction = recall.find(\"div\", class_=\"introduction\").get_text()\n", " \n", " try:\n", " recall_remedy = recall.find(\"div\", class_=\"remedy\").get_text()\n", " except:\n", " recall_remedy = None\n", " try:\n", " recall_units = recall.find(\"div\", class_=\"units\").get_text()\n", " except:\n", " recall_units = None\n", " try:\n", " recall_link = recall.find(\"a\").get(\"href\")\n", " recall_link = \"https://www.cpsc.gov\" + recall_link\n", " except: \n", " return None\n", "\n", " recall_list.append((recall_title, recall_date, recall_introduction, recall_remedy, recall_units, recall_link))\n", " return recall_list\n", " \n", " except:\n", " print(\"Error!!!\")\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recalls_v3(\"sofa\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recalls_v3(\"chair\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "---\n", "---\n", "---\n", "\n", "### Write a function that returns gets recall_link and returns (if possible) \"Manufactured In\":" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall_link = \"https://www.cpsc.gov/Recalls/2019/CVB-Recalls-LUCID-Folding-Mattress-Sofas-Due-to-Violation-of-Federal-Mattress-Flammability-Standard\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "response = requests.get(recall_link)\n", "response.status_code" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "webpage_text = response.text\n", "soup = BeautifulSoup(webpage_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "soup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall_product = soup.find_all(\"div\",class_=\"field\")\n", "print(recall_product)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for item in recall_product:\n", " if item.find(\"div\",class_=\"field-label\").get_text() == \"Manufactured In: \":\n", " manufactured_country = item.find(\"div\",class_=\"field-item\").get_text()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "manufactured_country" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### function:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_recall_manufactured_country(recall_link):\n", " \n", " import requests\n", " from bs4 import BeautifulSoup\n", " \n", " response = requests.get(recall_link)\n", " if not response.status_code == 200:\n", " print(\"the link is broken\")\n", " return None\n", " \n", " recall_info = list()\n", " \n", " \n", " try:\n", " webpage_text = response.text\n", " soup = BeautifulSoup(webpage_text)\n", " recall_product = soup.find_all(\"div\",class_=\"field\")\n", " try:\n", " for item in recall_product:\n", " if item.find(\"div\",class_=\"field-label\").get_text() == \"Manufactured In: \":\n", " manufactured_country = item.find(\"div\",class_=\"field-item\").get_text()\n", " return manufactured_country \n", " \n", " except:\n", " return None\n", " \n", " except:\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recall_manufactured_country(recall_link)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "---\n", "---\n", "---\n", "\n", "### add manufactured country information \"get_recalls_v3(keywords)\" function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_recalls_v4(keywords):\n", " \n", " import requests\n", " from bs4 import BeautifulSoup\n", " \n", " url = \"https://www.cpsc.gov/Recalls?combine=\" + keywords\n", " response = requests.get(url)\n", " if not response.status_code == 200:\n", " print(\"the link is broken\")\n", " return None\n", " \n", " recall_list = list()\n", " \n", " \n", " try:\n", " webpage_text = response.text\n", " soup = BeautifulSoup(webpage_text)\n", " recalls = soup.find_all(\"div\", class_=\"views-field views-field-php\")\n", " for recall in recalls:\n", " recall_title = recall.find(\"div\", class_=\"title\").get_text()\n", " recall_date = recall.find(\"div\", class_=\"date\").get_text()\n", " recall_introduction = recall.find(\"div\", class_=\"introduction\").get_text()\n", " \n", " try:\n", " recall_remedy = recall.find(\"div\", class_=\"remedy\").get_text()\n", " except:\n", " recall_remedy = None\n", " try:\n", " recall_units = recall.find(\"div\", class_=\"units\").get_text()\n", " except:\n", " recall_units = None\n", " try:\n", " recall_link = recall.find(\"a\").get(\"href\")\n", " recall_link = \"https://www.cpsc.gov\" + recall_link\n", " manufactured_country = get_recall_manufactured_country(recall_link)\n", " except: \n", " return None\n", " \n", " recall_list.append((recall_title, recall_date, recall_introduction, recall_remedy, recall_units, recall_link,\n", " manufactured_country))\n", " return recall_list\n", " \n", " except:\n", " print(\"Error!!!\")\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_recalls_v4(\"tv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }