Lecture 21 – Joining and Row Methods

Data 94, Spring 2021

In [1]:
from datascience import *
import numpy as np

joining

In [2]:
phones = Table().with_columns(
    'Model', np.array(['iPhone 12', 'iPhone 12 Pro Max', 'Samsung Galaxy S21', 'OnePlus 8']),
    'Price', np.array([799, 1099, 799, 699]),
    'Screen Size', np.array([6.1, 6.7, 6.2, 6.6])
)

inventory = Table().with_columns(
    'Handset', np.array(['Samsung Galaxy S21', 'iPhone 12', 'iPhone 12', 'OnePlus 8', 'Pixel 5']),
    'Units', np.array([50, 40, 10, 100, 25]),
    'Store', np.array(['Berkeley', 'Berkeley', 'San Francisco', 'Oakland', 'Oakland'])
)
In [3]:
phones
Out[3]:
Model Price Screen Size
iPhone 12 799 6.1
iPhone 12 Pro Max 1099 6.7
Samsung Galaxy S21 799 6.2
OnePlus 8 699 6.6
In [4]:
inventory
Out[4]:
Handset Units Store
Samsung Galaxy S21 50 Berkeley
iPhone 12 40 Berkeley
iPhone 12 10 San Francisco
OnePlus 8 100 Oakland
Pixel 5 25 Oakland
In [5]:
phones.join('Model', inventory, 'Handset')
Out[5]:
Model Price Screen Size Units Store
OnePlus 8 699 6.6 100 Oakland
Samsung Galaxy S21 799 6.2 50 Berkeley
iPhone 12 799 6.1 40 Berkeley
iPhone 12 799 6.1 10 San Francisco
In [6]:
inventory.join('Handset', phones, 'Model')
Out[6]:
Handset Units Store Price Screen Size
OnePlus 8 100 Oakland 699 6.6
Samsung Galaxy S21 50 Berkeley 799 6.2
iPhone 12 40 Berkeley 799 6.1
iPhone 12 10 San Francisco 799 6.1
In [7]:
store = phones.join('Model', inventory, 'Handset')
store
Out[7]:
Model Price Screen Size Units Store
OnePlus 8 699 6.6 100 Oakland
Samsung Galaxy S21 799 6.2 50 Berkeley
iPhone 12 799 6.1 40 Berkeley
iPhone 12 799 6.1 10 San Francisco
In [8]:
store.column('Price') * store.column('Units')
Out[8]:
array([69900, 39950, 31960,  7990])
In [9]:
# Total value of all of the phones in my inventory (that I know the price of)
np.sum(store.column('Price') * store.column('Units'))
Out[9]:
149800
In [10]:
# Equivalent to the above
np.dot(store.column('Price'), store.column('Units'))
Out[10]:
149800

Quick Check 1

In [11]:
contacts = Table().with_columns(
    'Name', np.array(['Roxanne', 'Sandy', 'Stan', 'Tomas', 'Wilma']),
    'Email', np.array(['roxanne@berkeley.edu', 'sandy@nyu.edu', 'stan.vg@gmail.com', 'tomastrain@umich.edu', 'wilma@columbia.edu']),
    'Area Code', np.array([510, 212, 734, 734, 212]),
)

codes = Table().with_columns(
    'Code', np.array([212, 310, 519, 734]),
    'Region', np.array(['New York City', 'Los Angeles', 'Ontario, Canada', 'Metro Detroit'])
)
In [12]:
contacts
Out[12]:
Name Email Area Code
Roxanne roxanne@berkeley.edu 510
Sandy sandy@nyu.edu 212
Stan stan.vg@gmail.com 734
Tomas tomastrain@umich.edu 734
Wilma wilma@columbia.edu 212
In [13]:
codes
Out[13]:
Code Region
212 New York City
310 Los Angeles
519 Ontario, Canada
734 Metro Detroit
In [ ]:
# contacts.join(___, ___, ___)

Followup

In [14]:
extra_codes = Table().with_columns(
    'Code', np.array([212, 212, 519, 734]),
    'Region', np.array(['New York City', 'Los Angeles', 'Ontario, Canada', 'Metro Detroit'])
)
In [15]:
contacts
Out[15]:
Name Email Area Code
Roxanne roxanne@berkeley.edu 510
Sandy sandy@nyu.edu 212
Stan stan.vg@gmail.com 734
Tomas tomastrain@umich.edu 734
Wilma wilma@columbia.edu 212
In [16]:
extra_codes
Out[16]:
Code Region
212 New York City
212 Los Angeles
519 Ontario, Canada
734 Metro Detroit
In [17]:
contacts.join('Area Code', extra_codes, 'Code')
Out[17]:
Area Code Name Email Region
212 Sandy sandy@nyu.edu New York City
212 Sandy sandy@nyu.edu Los Angeles
212 Wilma wilma@columbia.edu New York City
212 Wilma wilma@columbia.edu Los Angeles
734 Stan stan.vg@gmail.com Metro Detroit
734 Tomas tomastrain@umich.edu Metro Detroit

Disclaimer

In [18]:
# No output – because there are no matches between
# the 'Name' column in contacts and the 'Code' column in codes
contacts.join('Name', codes, 'Code')

Example: grades

In [19]:
roster = Table.read_table('data/roster.csv')
grades = Table.read_table('data/grades.csv')
In [20]:
roster.show()
Name Email SID Grading Option
Montrezl Harrell trez15@lakers.com 11259 PNP
LeBron James kingjames@berkeley.edu 29314 GRD
Kyle Kuzma kuz@utah.edu 29319 GRD
Marc Gasol gasol@spain.co.es 34892 GRD
Dennis Schroder dennis@okc.com 40561 PNP
Kentavious Caldwell-Pope kc3p@pistons.ca 40915 PNP
Talen Horton-Tucker tht@stanford.edu 44003 GRD
Markieff Morris kieff@clippers.co.uk 77143 GRD
Wesley Matthews wesmath@mavs.dallas.com 81920 GRD
Alex Caruso carugoat@gov.on.ca 83212 GRD
Anthony Davis AD_brow@pels.edu 85002 PNP
Jared Dudley jdudz3@berkeley.edu 85923 GRD
In [21]:
grades.show()
SID Homework 1 Homework 2 Quiz 1 Quiz 2 Final
29314 24 18 98 63 88
44003 23 17 87 67 82
83212 23 9 88 58 19
29319 23 10 65 62.5 45
55510 23 15 78 63.5 56
98102 23 16 82 55 90
34892 22 17 83 21 90
40561 24 17 89 70 82.5
85923 21 18 92 70 78.25
85002 20 18 89.5 53 80
11259 18 17.5 88 64 78
81920 23 16 42 65 79
77143 24 15 90 61 90
In [22]:
roster.num_rows
Out[22]:
12
In [23]:
grades.num_rows
Out[23]:
13
In [24]:
grades_merged = roster.join('SID', grades)
grades_merged
Out[24]:
SID Name Email Grading Option Homework 1 Homework 2 Quiz 1 Quiz 2 Final
11259 Montrezl Harrell trez15@lakers.com PNP 18 17.5 88 64 78
29314 LeBron James kingjames@berkeley.edu GRD 24 18 98 63 88
29319 Kyle Kuzma kuz@utah.edu GRD 23 10 65 62.5 45
34892 Marc Gasol gasol@spain.co.es GRD 22 17 83 21 90
40561 Dennis Schroder dennis@okc.com PNP 24 17 89 70 82.5
44003 Talen Horton-Tucker tht@stanford.edu GRD 23 17 87 67 82
77143 Markieff Morris kieff@clippers.co.uk GRD 24 15 90 61 90
81920 Wesley Matthews wesmath@mavs.dallas.com GRD 23 16 42 65 79
83212 Alex Caruso carugoat@gov.on.ca GRD 23 9 88 58 19
85002 Anthony Davis AD_brow@pels.edu PNP 20 18 89.5 53 80

... (1 rows omitted)

In [25]:
grades_merged.num_rows
Out[25]:
11

Let's see if there are any students in the roster who we don't have grades for:

In [26]:
for sid in roster.column('SID'):
    if sid not in grades.column('SID'):
        display(roster.where('SID', sid))
Name Email SID Grading Option
Kentavious Caldwell-Pope kc3p@pistons.ca 40915 PNP

And vice versa:

In [27]:
for sid in grades.column('SID'):
    if sid not in roster.column('SID'):
        display(grades.where('SID', sid))
SID Homework 1 Homework 2 Quiz 1 Quiz 2 Final
55510 23 15 78 63.5 56
SID Homework 1 Homework 2 Quiz 1 Quiz 2 Final
98102 23 16 82 55 90

Whoever the above students are, they're not in our roster. (This happens often when students drop a class.)

Now we can do some grade calculations:

In [28]:
grades_merged
Out[28]:
SID Name Email Grading Option Homework 1 Homework 2 Quiz 1 Quiz 2 Final
11259 Montrezl Harrell trez15@lakers.com PNP 18 17.5 88 64 78
29314 LeBron James kingjames@berkeley.edu GRD 24 18 98 63 88
29319 Kyle Kuzma kuz@utah.edu GRD 23 10 65 62.5 45
34892 Marc Gasol gasol@spain.co.es GRD 22 17 83 21 90
40561 Dennis Schroder dennis@okc.com PNP 24 17 89 70 82.5
44003 Talen Horton-Tucker tht@stanford.edu GRD 23 17 87 67 82
77143 Markieff Morris kieff@clippers.co.uk GRD 24 15 90 61 90
81920 Wesley Matthews wesmath@mavs.dallas.com GRD 23 16 42 65 79
83212 Alex Caruso carugoat@gov.on.ca GRD 23 9 88 58 19
85002 Anthony Davis AD_brow@pels.edu PNP 20 18 89.5 53 80

... (1 rows omitted)

In [29]:
assignment_totals = {
    'Homework 1': 24,
    'Homework 2': 18,
    'Quiz 1': 100,
    'Quiz 2': 70,
    'Final': 90
}
In [30]:
# Divides each assignment column by its denominator in assignment_totals
for assignment in assignment_totals.keys():
    grades_merged = grades_merged.with_columns(
        assignment + ' Percentage', grades_merged.column(assignment) / assignment_totals[assignment]
    )
In [31]:
grades_merged
Out[31]:
SID Name Email Grading Option Homework 1 Homework 2 Quiz 1 Quiz 2 Final Homework 1 Percentage Homework 2 Percentage Quiz 1 Percentage Quiz 2 Percentage Final Percentage
11259 Montrezl Harrell trez15@lakers.com PNP 18 17.5 88 64 78 0.75 0.972222 0.88 0.914286 0.866667
29314 LeBron James kingjames@berkeley.edu GRD 24 18 98 63 88 1 1 0.98 0.9 0.977778
29319 Kyle Kuzma kuz@utah.edu GRD 23 10 65 62.5 45 0.958333 0.555556 0.65 0.892857 0.5
34892 Marc Gasol gasol@spain.co.es GRD 22 17 83 21 90 0.916667 0.944444 0.83 0.3 1
40561 Dennis Schroder dennis@okc.com PNP 24 17 89 70 82.5 1 0.944444 0.89 1 0.916667
44003 Talen Horton-Tucker tht@stanford.edu GRD 23 17 87 67 82 0.958333 0.944444 0.87 0.957143 0.911111
77143 Markieff Morris kieff@clippers.co.uk GRD 24 15 90 61 90 1 0.833333 0.9 0.871429 1
81920 Wesley Matthews wesmath@mavs.dallas.com GRD 23 16 42 65 79 0.958333 0.888889 0.42 0.928571 0.877778
83212 Alex Caruso carugoat@gov.on.ca GRD 23 9 88 58 19 0.958333 0.5 0.88 0.828571 0.211111
85002 Anthony Davis AD_brow@pels.edu PNP 20 18 89.5 53 80 0.833333 1 0.895 0.757143 0.888889

... (1 rows omitted)

Other tools

.row

In [32]:
phones
Out[32]:
Model Price Screen Size
iPhone 12 799 6.1
iPhone 12 Pro Max 1099 6.7
Samsung Galaxy S21 799 6.2
OnePlus 8 699 6.6
In [33]:
phones.row(1)
Out[33]:
Row(Model='iPhone 12 Pro Max', Price=1099, Screen Size=6.7)
In [34]:
type(phones.row(1))
Out[34]:
datascience.tables.Row
In [35]:
phones.row(1).item(1)
Out[35]:
1099
In [36]:
list(phones.row(-1))
Out[36]:
['OnePlus 8', 699, 6.6]

.with_rows

In [37]:
phones
Out[37]:
Model Price Screen Size
iPhone 12 799 6.1
iPhone 12 Pro Max 1099 6.7
Samsung Galaxy S21 799 6.2
OnePlus 8 699 6.6
In [38]:
phones.with_row(['iPhone 12 Mini', 699, 5.8])
Out[38]:
Model Price Screen Size
iPhone 12 799 6.1
iPhone 12 Pro Max 1099 6.7
Samsung Galaxy S21 799 6.2
OnePlus 8 699 6.6
iPhone 12 Mini 699 5.8
In [39]:
phones.with_rows([['iPhone 12 Mini', 699, 5.8],
                  ['Moto RAZR', 459, 3.5]])
Out[39]:
Model Price Screen Size
iPhone 12 799 6.1
iPhone 12 Pro Max 1099 6.7
Samsung Galaxy S21 799 6.2
OnePlus 8 699 6.6
iPhone 12 Mini 699 5.8
Moto RAZR 459 3.5

Quick Check 2

In [40]:
codes
Out[40]:
Code Region
212 New York City
310 Los Angeles
519 Ontario, Canada
734 Metro Detroit
In [ ]: