Pandas exercise in Python (Jupiter Notebook)

profilenieyanan
PandasExercise.zip

Device.csv

User,Device,OS,Transactions,Recent Visit,Revenue A,Mobile,,5,4/5/2020 11:03, B,Mobile,Android6,3,8/12/2020 23:03, C,,iOS,7,8/7/2020 10:20, D,Tablet,Android,1,7/18/2020 21:13, E,PC,Win,4,8/17/2020 1:23, F,iPad,iOS,,6/24/2020 15:23, G,PC,Windows10,,5/1/2020 12:33, H,iPhone 6,iOS5,2,6/12/2020 8:33, A,Mobile,,5,4/5/2020 11:03, B,Mobile,Android6,3,8/12/2020 23:03,

Pandas Exercise.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise for Pandas \n", "files device.csv and user.csv are needed." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "John 3.5\n", "Amy 4.1\n", "Elle 3.8\n", "Kevin 3.8\n", "dtype: float64\n" ] } ], "source": [ "# Create a series \"GPA\" with a Python list [3.5, 4.1, 3.8, 3.8], Provide the names as the index\n", "# John, Amy, Elle, Kevin\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n" ] } ], "source": [ "# Get the number of values in 'GPA' with the size attribute\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "John 3.5\n", "Amy 4.1\n", "Elle 3.8\n", "Kevin 3.8\n", "Emma 3.9\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use indexing operator [] to add Emma with 3.9 as her GPA\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([3.5, 4.1, 3.8, 3.9])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the unique values of the series 'GPA' with unique() method\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.8 2\n", "4.1 1\n", "3.9 1\n", "3.5 1\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the frequency of unique values using value_counts() method\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.8 0.4\n", "4.1 0.2\n", "3.9 0.2\n", "3.5 0.2\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get % of the numbers of unique values using value_counts()\n", "# An argument must be passed in\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.8" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get GPA of Kevin's using index operator []\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "John 3.5\n", "Amy 4.1\n", "Elle 3.8\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get GPA of the first 3 people using slicing\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Elle 3.8\n", "Kevin 3.8\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the elements with GPA value of 3.8\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender\n", "John 25 M\n", "Amy 27 F\n", "Elle 28 F\n", "Kevin 26 M" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a dataframe 'student' with list: [(25, 'M'), (27, 'F'), (28, 'F'), (26, 'M')]\n", "# index ['John', 'Amy', 'Elle', 'Kevin'] and columns ['Age', 'Gender']\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country\n", "John 25 M US\n", "Amy 27 F China\n", "Elle 28 F UK\n", "Kevin 26 M China" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a column country with the values: US, China, UK, China\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " <td>3.8</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "John 25 M US 3.5\n", "Amy 27 F China 4.1\n", "Elle 28 F UK 3.8\n", "Kevin 26 M China 3.8" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add GPA created before as a column to 'student'\n", "\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4, 4)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the shape attribute of the 'student' dataframe\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Age', 'Gender', 'Country', 'GPA'], dtype='object')" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the columns attribute of the 'student' dataframe\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Age 27\n", "Gender F\n", "Country China\n", "GPA 4.1\n", "Name: Amy, dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get all details of Amy's using loc[] operator\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Age 26\n", "Gender M\n", "Country China\n", "GPA 3.8\n", "Name: Kevin, dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the details of the student in the 4th row with iloc[] operator\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender\n", "John 25 M\n", "Amy 27 F\n", "Elle 28 F\n", "Kevin 26 M" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get two columns: Age and Gender\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>4.1</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>3.8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age GPA\n", "Amy 27 4.1\n", "Elle 28 3.8" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use loc[] to get Amy and Elles' Age and GPA\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "28" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the age of the oldest student (use max() method)\n", "\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "Amy 27 F China 4.1" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get female students whose GPA are higher than 3.8\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "John 25 M US 3.5\n", "Amy 27 F China 4.1\n", "Elle 28 F UK 4.0\n", "Kevin 26 M China 3.8" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Change Elle's GPA to be 4.0 using loc[]\n", "\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "John 25 M US 3.5\n", "Amy 27 F China 4.1\n", "Kevin 26 M China 3.8" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get all students from China or the US using loc[]. (Use isin() method also)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Age Gender Country GPA\n", "John 25 M US 3.5\n", "Amy 27 F China 4.1\n", "Elle 28 F UK 4.0\n", "Kevin 26 M China 3.8\n", " Age Gender GPA\n", "John 25 M 3.5\n", "Amy 27 F 4.1\n", "Elle 28 F 4.0\n", "Kevin 26 M 3.8\n" ] } ], "source": [ "# Drop country column from 'student' and store the result in 'student2'\n", "# print out both 'student' and 'student2'. Note the difference\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " </tr>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "Kevin 26 M China 3.8\n", "John 25 M US 3.5\n", "Elle 28 F UK 4.0\n", "Amy 27 F China 4.1" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sort 'student' by index\n", "# Sort 'student' by index in descending order\n", "\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA\n", "John 25 M US 3.5\n", "Elle 28 F UK 4.0\n", "Amy 27 F China 4.1\n", "Kevin 26 M China 3.8" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sort 'student' by GPA column\n", "\n", "# Sort 'student' by Country, GPA columns, in descending order\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>Country</th>\n", " <th>GPA</th>\n", " <th>GPA-changed</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>John</th>\n", " <td>25</td>\n", " <td>M</td>\n", " <td>US</td>\n", " <td>3.5</td>\n", " <td>2.80</td>\n", " </tr>\n", " <tr>\n", " <th>Amy</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>China</td>\n", " <td>4.1</td>\n", " <td>3.28</td>\n", " </tr>\n", " <tr>\n", " <th>Elle</th>\n", " <td>28</td>\n", " <td>F</td>\n", " <td>UK</td>\n", " <td>4.0</td>\n", " <td>3.20</td>\n", " </tr>\n", " <tr>\n", " <th>Kevin</th>\n", " <td>26</td>\n", " <td>M</td>\n", " <td>China</td>\n", " <td>3.8</td>\n", " <td>3.04</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Age Gender Country GPA GPA-changed\n", "John 25 M US 3.5 2.80\n", "Amy 27 F China 4.1 3.28\n", "Elle 28 F UK 4.0 3.20\n", "Kevin 26 M China 3.8 3.04" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All the GPAs are out of 5, change them to be based on 4\n", "# Save results to a column \"GPA-changed\"\n", "\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Age GPA GPA-changed\n", "count 4.000000 4.000000 4.00000\n", "mean 26.500000 3.850000 3.08000\n", "std 1.290994 0.264575 0.21166\n", "min 25.000000 3.500000 2.80000\n", "25% 25.750000 3.725000 2.98000\n", "50% 26.500000 3.900000 3.12000\n", "75% 27.250000 4.025000 3.22000\n", "max 28.000000 4.100000 3.28000\n" ] } ], "source": [ "# Use describe() method to get descriptive statistics\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Gender Country\n", "count 4 4\n", "unique 2 3\n", "top M China\n", "freq 2 2\n" ] } ], "source": [ "# Use describe() to get descriptive statistics of text values\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Revenue</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>NaN</td>\n", " <td>iOS</td>\n", " <td>7.0</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.0</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.0</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>NaN</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>NaN</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.0</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Revenue\n", "0 A Mobile NaN 5.0 4/5/2020 11:03 NaN\n", "1 B Mobile Android6 3.0 8/12/2020 23:03 NaN\n", "2 C NaN iOS 7.0 8/7/2020 10:20 NaN\n", "3 D Tablet Android 1.0 7/18/2020 21:13 NaN\n", "4 E PC Win 4.0 8/17/2020 1:23 NaN\n", "5 F iPad iOS NaN 6/24/2020 15:23 NaN\n", "6 G PC Windows10 NaN 5/1/2020 12:33 NaN\n", "7 H iPhone 6 iOS5 2.0 6/12/2020 8:33 NaN\n", "8 A Mobile NaN 5.0 4/5/2020 11:03 NaN\n", "9 B Mobile Android6 3.0 8/12/2020 23:03 NaN" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load data from device.csv into 'device_df'\n", "# Display it\n", "\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 10 entries, 0 to 9\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 User 10 non-null object \n", " 1 Device 9 non-null object \n", " 2 OS 8 non-null object \n", " 3 Transactions 8 non-null float64\n", " 4 Recent Visit 10 non-null object \n", " 5 Revenue 0 non-null float64\n", "dtypes: float64(2), object(4)\n", "memory usage: 608.0+ bytes\n" ] } ], "source": [ "# Use info() to get the number of valid values in each column\n" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Revenue</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "Empty DataFrame\n", "Columns: [User, Device, OS, Transactions, Recent Visit, Revenue]\n", "Index: []" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use dropna() to drop rows \n" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>NaN</td>\n", " <td>iOS</td>\n", " <td>7.0</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.0</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.0</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>NaN</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>NaN</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.0</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile NaN 5.0 4/5/2020 11:03\n", "1 B Mobile Android6 3.0 8/12/2020 23:03\n", "2 C NaN iOS 7.0 8/7/2020 10:20\n", "3 D Tablet Android 1.0 7/18/2020 21:13\n", "4 E PC Win 4.0 8/17/2020 1:23\n", "5 F iPad iOS NaN 6/24/2020 15:23\n", "6 G PC Windows10 NaN 5/1/2020 12:33\n", "7 H iPhone 6 iOS5 2.0 6/12/2020 8:33\n", "8 A Mobile NaN 5.0 4/5/2020 11:03\n", "9 B Mobile Android6 3.0 8/12/2020 23:03" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Only drop columns with no valid values\n", "# Save the result back to 'device_df' so the original data is changed\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Missing</td>\n", " <td>5</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Missing</td>\n", " <td>iOS</td>\n", " <td>7</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>Missing</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>Missing</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Missing</td>\n", " <td>5</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile Missing 5 4/5/2020 11:03\n", "1 B Mobile Android6 3 8/12/2020 23:03\n", "2 C Missing iOS 7 8/7/2020 10:20\n", "3 D Tablet Android 1 7/18/2020 21:13\n", "4 E PC Win 4 8/17/2020 1:23\n", "5 F iPad iOS Missing 6/24/2020 15:23\n", "6 G PC Windows10 Missing 5/1/2020 12:33\n", "7 H iPhone 6 iOS5 2 6/12/2020 8:33\n", "8 A Mobile Missing 5 4/5/2020 11:03\n", "9 B Mobile Android6 3 8/12/2020 23:03" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fillna() with a constant \"Missing\"\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.0</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.0</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.0</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>4.0</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>4.0</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.0</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>iOS5</td>\n", " <td>5.0</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.0</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile NaN 5.0 4/5/2020 11:03\n", "1 B Mobile Android6 3.0 8/12/2020 23:03\n", "2 C Mobile iOS 7.0 8/7/2020 10:20\n", "3 D Tablet Android 1.0 7/18/2020 21:13\n", "4 E PC Win 4.0 8/17/2020 1:23\n", "5 F iPad iOS 4.0 6/24/2020 15:23\n", "6 G PC Windows10 4.0 5/1/2020 12:33\n", "7 H iPhone 6 iOS5 2.0 6/12/2020 8:33\n", "8 A Mobile iOS5 5.0 4/5/2020 11:03\n", "9 B Mobile Android6 3.0 8/12/2020 23:03" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Try to fillna() with ffill\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile Android6 5.00 4/5/2020 11:03\n", "1 B Mobile Android6 3.00 8/12/2020 23:03\n", "2 C Mobile iOS 7.00 8/7/2020 10:20\n", "3 D Tablet Android 1.00 7/18/2020 21:13\n", "4 E PC Win 4.00 8/17/2020 1:23\n", "5 F iPad iOS 3.75 6/24/2020 15:23\n", "6 G PC Windows10 3.75 5/1/2020 12:33\n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33\n", "8 A Mobile Android6 5.00 4/5/2020 11:03\n", "9 B Mobile Android6 3.00 8/12/2020 23:03" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fillna() of Device and OS columns using mode, Transactions column using mean\n", "# Save the result to 'device_df'\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile Android6 5.00 4/5/2020 11:03\n", "1 B Mobile Android6 3.00 8/12/2020 23:03\n", "2 C Mobile iOS 7.00 8/7/2020 10:20\n", "3 D Tablet Android 1.00 7/18/2020 21:13\n", "4 E PC Win 4.00 8/17/2020 1:23\n", "5 F iPad iOS 3.75 6/24/2020 15:23\n", "6 G PC Windows10 3.75 5/1/2020 12:33\n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop duplicates in 'device_df'. Save results to 'device_df'\n", "\n" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>Tablet</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit\n", "0 A Mobile Android 5.00 4/5/2020 11:03\n", "1 B Mobile Android 3.00 8/12/2020 23:03\n", "2 C Mobile iOS 7.00 8/7/2020 10:20\n", "3 D Tablet Android 1.00 7/18/2020 21:13\n", "4 E PC Win 4.00 8/17/2020 1:23\n", "5 F Tablet iOS 3.75 6/24/2020 15:23\n", "6 G PC Win 3.75 5/1/2020 12:33\n", "7 H Mobile iOS 2.00 6/12/2020 8:33" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make values in columns consistant. In Device only values should be\n", "# 'Mobile', 'Tablet', and 'PC'. In OS, should have 'Android', 'iOS', 'Win' only\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 (3.0, 5.0]\n", "1 (0.994, 3.0]\n", "2 (5.0, 7.0]\n", "3 (0.994, 3.0]\n", "4 (3.0, 5.0]\n", "5 (3.0, 5.0]\n", "6 (3.0, 5.0]\n", "7 (0.994, 3.0]\n", "Name: Transactions, dtype: category\n", "Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bin Transactions into 3 equal-range groups\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jieli\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP\n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic\n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP\n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic\n", "4 E PC Win 4.00 8/17/2020 1:23 VIP\n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP\n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP\n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bin Transactions with cutoff points: 0, 3, 10. Save the bins to column\n", "# named 'Status'. Higer value should be labled VIP and lower value should be labelled Classic \n", "# It is safe to ignore the warnings with red background\n", "\n" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2020-04-05 11:03:00', '2020-08-12 23:03:00',\n", " '2020-08-07 10:20:00', '2020-07-18 21:13:00',\n", " '2020-08-17 01:23:00', '2020-06-24 15:23:00',\n", " '2020-05-01 12:33:00', '2020-06-12 08:33:00'],\n", " dtype='datetime64[ns]', name='Recent Visit', freq=None)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get a DatetimeIndexof() the Recent Visit column. Save it to 'dt_index'\n", "\n" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jieli\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " \n", "C:\\Users\\jieli\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " This is separate from the ipykernel package so we can avoid doing imports until\n", "C:\\Users\\jieli\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " after removing the cwd from sys.path.\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020 \n", "\n", " Quarter WeekDay \n", "0 2 6 \n", "1 3 2 \n", "2 3 4 \n", "3 3 5 \n", "4 3 0 \n", "5 2 2 \n", "6 2 4 \n", "7 2 4 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add Year, Quarter and WeekDay as columns to 'device_df'\n", "# It is safe to ignore the warnings with red background\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Mobile PC Tablet iPad iPhone 6\n", "0 1 0 0 0 0\n", "1 1 0 0 0 0\n", "2 1 0 0 0 0\n", "3 0 0 1 0 0\n", "4 0 1 0 0 0\n", "5 0 0 0 1 0\n", "6 0 1 0 0 0\n", "7 0 0 0 0 1" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert Device to dummy variables and save it to 'dummies' variable\n", "\n" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020 \n", "\n", " Quarter WeekDay Mobile PC Tablet iPad iPhone 6 \n", "0 2 6 1 0 0 0 0 \n", "1 3 2 1 0 0 0 0 \n", "2 3 4 1 0 0 0 0 \n", "3 3 5 0 0 1 0 0 \n", "4 3 0 0 1 0 0 0 \n", "5 2 2 0 0 0 1 0 \n", "6 2 4 0 1 0 0 0 \n", "7 2 4 0 0 0 0 1 " ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merge 'dummies' to 'device_df'. Name the new one 'device_df_dummies'\n", "\n" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Name</th>\n", " <th>Gender</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>F</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>M</td>\n", " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>F</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>M</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>F</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>M</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>I</td>\n", " <td>F</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>J</td>\n", " <td>M</td>\n", " <td>43</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>K</td>\n", " <td>M</td>\n", " <td>29</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Name Gender Age\n", "0 A F 45\n", "1 B M 33\n", "2 C F 21\n", "3 D M 19\n", "4 E F 34\n", "5 F F 67\n", "6 G M 28\n", "7 H F 34\n", "8 I F 55\n", "9 J M 43\n", "10 K M 29" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load data in user.csv into user_df. Display the data\n", "\n" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "# Merge user_df and device_df_dummies with inner join\n", "# Save the result to device_user_df\n" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " <th>Gender</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>F</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>M</td>\n", " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>F</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>M</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>F</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>M</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020 \n", "\n", " Quarter WeekDay Mobile PC Tablet iPad iPhone 6 Gender Age \n", "0 2 6 1 0 0 0 0 F 45 \n", "1 3 2 1 0 0 0 0 M 33 \n", "2 3 4 1 0 0 0 0 F 21 \n", "3 3 5 0 0 1 0 0 M 19 \n", "4 3 0 0 1 0 0 0 F 34 \n", "5 2 2 0 0 0 1 0 F 67 \n", "6 2 4 0 1 0 0 0 M 28 \n", "7 2 4 0 0 0 0 1 F 34 " ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop the Name column in 'device_user_df'since it is a duplicate. How many columns left?\n", "\n" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " <th>Name</th>\n", " <th>Gender</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>6.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>A</td>\n", " <td>F</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>B</td>\n", " <td>M</td>\n", " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>C</td>\n", " <td>F</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>5.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>D</td>\n", " <td>M</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>E</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>F</td>\n", " <td>F</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>G</td>\n", " <td>M</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>H</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>I</td>\n", " <td>F</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>J</td>\n", " <td>M</td>\n", " <td>43</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>K</td>\n", " <td>M</td>\n", " <td>29</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020.0 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020.0 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020.0 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020.0 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020.0 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020.0 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020.0 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020.0 \n", "8 NaN NaN NaN NaN NaN NaN NaN \n", "9 NaN NaN NaN NaN NaN NaN NaN \n", "10 NaN NaN NaN NaN NaN NaN NaN \n", "\n", " Quarter WeekDay Mobile PC Tablet iPad iPhone 6 Name Gender Age \n", "0 2.0 6.0 1.0 0.0 0.0 0.0 0.0 A F 45 \n", "1 3.0 2.0 1.0 0.0 0.0 0.0 0.0 B M 33 \n", "2 3.0 4.0 1.0 0.0 0.0 0.0 0.0 C F 21 \n", "3 3.0 5.0 0.0 0.0 1.0 0.0 0.0 D M 19 \n", "4 3.0 0.0 0.0 1.0 0.0 0.0 0.0 E F 34 \n", "5 2.0 2.0 0.0 0.0 0.0 1.0 0.0 F F 67 \n", "6 2.0 4.0 0.0 1.0 0.0 0.0 0.0 G M 28 \n", "7 2.0 4.0 0.0 0.0 0.0 0.0 1.0 H F 34 \n", "8 NaN NaN NaN NaN NaN NaN NaN I F 55 \n", "9 NaN NaN NaN NaN NaN NaN NaN J M 43 \n", "10 NaN NaN NaN NaN NaN NaN NaN K M 29 " ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Try outer join, left join and right join to see the difference\n" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " <th>Name</th>\n", " <th>Gender</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>A</td>\n", " <td>F</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>B</td>\n", " <td>M</td>\n", " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>C</td>\n", " <td>F</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>D</td>\n", " <td>M</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>E</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>F</td>\n", " <td>F</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>G</td>\n", " <td>M</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>H</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020 \n", "\n", " Quarter WeekDay Mobile PC Tablet iPad iPhone 6 Name Gender Age \n", "0 2 6 1 0 0 0 0 A F 45 \n", "1 3 2 1 0 0 0 0 B M 33 \n", "2 3 4 1 0 0 0 0 C F 21 \n", "3 3 5 0 0 1 0 0 D M 19 \n", "4 3 0 0 1 0 0 0 E F 34 \n", "5 2 2 0 0 0 1 0 F F 67 \n", "6 2 4 0 1 0 0 0 G M 28 \n", "7 2 4 0 0 0 0 1 H F 34 " ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Try outer join, left join and right join to see the difference\n" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>User</th>\n", " <th>Device</th>\n", " <th>OS</th>\n", " <th>Transactions</th>\n", " <th>Recent Visit</th>\n", " <th>Status</th>\n", " <th>Year</th>\n", " <th>Quarter</th>\n", " <th>WeekDay</th>\n", " <th>Mobile</th>\n", " <th>PC</th>\n", " <th>Tablet</th>\n", " <th>iPad</th>\n", " <th>iPhone 6</th>\n", " <th>Name</th>\n", " <th>Gender</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>A</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>5.00</td>\n", " <td>4/5/2020 11:03</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>6.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>A</td>\n", " <td>F</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>B</td>\n", " <td>Mobile</td>\n", " <td>Android6</td>\n", " <td>3.00</td>\n", " <td>8/12/2020 23:03</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>B</td>\n", " <td>M</td>\n", " <td>33</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C</td>\n", " <td>Mobile</td>\n", " <td>iOS</td>\n", " <td>7.00</td>\n", " <td>8/7/2020 10:20</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>C</td>\n", " <td>F</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>D</td>\n", " <td>Tablet</td>\n", " <td>Android</td>\n", " <td>1.00</td>\n", " <td>7/18/2020 21:13</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>5.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>D</td>\n", " <td>M</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>E</td>\n", " <td>PC</td>\n", " <td>Win</td>\n", " <td>4.00</td>\n", " <td>8/17/2020 1:23</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>E</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>F</td>\n", " <td>iPad</td>\n", " <td>iOS</td>\n", " <td>3.75</td>\n", " <td>6/24/2020 15:23</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>F</td>\n", " <td>F</td>\n", " <td>67</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>G</td>\n", " <td>PC</td>\n", " <td>Windows10</td>\n", " <td>3.75</td>\n", " <td>5/1/2020 12:33</td>\n", " <td>VIP</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>G</td>\n", " <td>M</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>H</td>\n", " <td>iPhone 6</td>\n", " <td>iOS5</td>\n", " <td>2.00</td>\n", " <td>6/12/2020 8:33</td>\n", " <td>Classic</td>\n", " <td>2020.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>H</td>\n", " <td>F</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>I</td>\n", " <td>F</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>J</td>\n", " <td>M</td>\n", " <td>43</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>K</td>\n", " <td>M</td>\n", " <td>29</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " User Device OS Transactions Recent Visit Status Year \\\n", "0 A Mobile Android6 5.00 4/5/2020 11:03 VIP 2020.0 \n", "1 B Mobile Android6 3.00 8/12/2020 23:03 Classic 2020.0 \n", "2 C Mobile iOS 7.00 8/7/2020 10:20 VIP 2020.0 \n", "3 D Tablet Android 1.00 7/18/2020 21:13 Classic 2020.0 \n", "4 E PC Win 4.00 8/17/2020 1:23 VIP 2020.0 \n", "5 F iPad iOS 3.75 6/24/2020 15:23 VIP 2020.0 \n", "6 G PC Windows10 3.75 5/1/2020 12:33 VIP 2020.0 \n", "7 H iPhone 6 iOS5 2.00 6/12/2020 8:33 Classic 2020.0 \n", "8 NaN NaN NaN NaN NaN NaN NaN \n", "9 NaN NaN NaN NaN NaN NaN NaN \n", "10 NaN NaN NaN NaN NaN NaN NaN \n", "\n", " Quarter WeekDay Mobile PC Tablet iPad iPhone 6 Name Gender Age \n", "0 2.0 6.0 1.0 0.0 0.0 0.0 0.0 A F 45 \n", "1 3.0 2.0 1.0 0.0 0.0 0.0 0.0 B M 33 \n", "2 3.0 4.0 1.0 0.0 0.0 0.0 0.0 C F 21 \n", "3 3.0 5.0 0.0 0.0 1.0 0.0 0.0 D M 19 \n", "4 3.0 0.0 0.0 1.0 0.0 0.0 0.0 E F 34 \n", "5 2.0 2.0 0.0 0.0 0.0 1.0 0.0 F F 67 \n", "6 2.0 4.0 0.0 1.0 0.0 0.0 0.0 G M 28 \n", "7 2.0 4.0 0.0 0.0 0.0 0.0 1.0 H F 34 \n", "8 NaN NaN NaN NaN NaN NaN NaN I F 55 \n", "9 NaN NaN NaN NaN NaN NaN NaN J M 43 \n", "10 NaN NaN NaN NaN NaN NaN NaN K M 29 " ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Try outer join, left join and right join to see the difference\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }

user.csv

Name Gender Age
A F 45
B M 33
C F 21
D M 19
E F 34
F F 67
G M 28
H F 34
I F 55
J M 43
K M 29