Lecture 19 – Applying

Data 94, Spring 2021

In [1]:
from datascience import *
import numpy as np

Motivation

In [2]:
pups = Table.read_table('data/pups.csv')
In [3]:
pups
Out[3]:
name age size
Junior Smith 11 medium
Rex Rogers 7 big
Flash Heat 3 big
Reese Bo 4 medium
Polo Cash 2 small
In [4]:
pups.with_columns(
    'human years', pups.column('age') * 7
)
Out[4]:
name age size human years
Junior Smith 11 medium 77
Rex Rogers 7 big 49
Flash Heat 3 big 21
Reese Bo 4 medium 28
Polo Cash 2 small 14

Apply

In [5]:
def seven_times(x):
    return 7 * x
In [6]:
pups.apply(seven_times, 'age')
Out[6]:
array([77, 49, 21, 28, 14])

Note, we wouldn't actually use the above example since we could just write pups.column('age') * 7.

Here's a more useful example:

In [7]:
def email_from_name(name):
    first, last = name.split(' ')
    email = first + '.' + last + '@dogschool.edu'
    return email.lower()
In [8]:
# Can use email_from_name on a single argument
email_from_name('Champ Major')
Out[8]:
'champ.major@dogschool.edu'
In [9]:
pups.apply(email_from_name, 'name')
Out[9]:
array(['junior.smith@dogschool.edu', 'rex.rogers@dogschool.edu',
       'flash.heat@dogschool.edu', 'reese.bo@dogschool.edu',
       'polo.cash@dogschool.edu'], dtype='<U26')
In [10]:
pups.with_columns('email', pups.apply(email_from_name, 'name'))
Out[10]:
name age size email
Junior Smith 11 medium junior.smith@dogschool.edu
Rex Rogers 7 big rex.rogers@dogschool.edu
Flash Heat 3 big flash.heat@dogschool.edu
Reese Bo 4 medium reese.bo@dogschool.edu
Polo Cash 2 small polo.cash@dogschool.edu
In [11]:
# Note, the parameter names don't
# need to be 'age' and 'size'
def human_years_converter(years_old, kind):
    if kind == 'small':
        return years_old * 6
    elif kind == 'medium':
        return years_old * 7
    else:
        return years_old * 8
In [12]:
human_years_converter(11, 'medium')
Out[12]:
77
In [13]:
human_years_converter(11, 'small')
Out[13]:
66
In [14]:
pups.apply(human_years_converter, 'age', 'size')
Out[14]:
array([77, 56, 24, 28, 12])
In [15]:
pups.with_columns('accurate human years', pups.apply(human_years_converter, 'age', 'size'))
Out[15]:
name age size accurate human years
Junior Smith 11 medium 77
Rex Rogers 7 big 56
Flash Heat 3 big 24
Reese Bo 4 medium 28
Polo Cash 2 small 12

Quick Check 1

In [16]:
# Large file – this may take ~10 seconds to load
salary = Table.read_table('https://media.githubusercontent.com/media/dailycal-projects/ucb-faculty-salary/master/data/salary/salary_2015.csv')
salary
Out[16]:
year location first last title gross regular overtime other
2015 Berkeley ANNE AABOE BUS SYS ANL 4 124,454 124,454 0 0
2015 Berkeley DAVID AAKER RECALL FACULTY 2,500 0 0 2,500
2015 Berkeley ELIZABETH ABEL PROF-AY 138,775 138,775 0 0
2015 Berkeley NORMAN ABRAHAMSON ADJ PROF-AY-1/9-B/E/E 19,668 19,668 0 0
2015 Berkeley BARBARA ABRAMS PROF-AY 191,162 169,862 0 21,300
2015 Berkeley JOHN ACZON FINANCIAL SVC ANL 3 83,510 78,510 0 5,000
2015 Berkeley ANTHONY ADAMS RES-FY 9,587 9,587 0 0
2015 Berkeley PENNY HINES ACAD HR ANL 5 126,707 126,707 0 0
2015 Berkeley ANINDITA ADHIKARI SR LECT SOE-AY 107,345 107,345 0 0
2015 Berkeley ILAN ADLER PROF-AY-B/E/E 166,617 151,617 0 15,000

... (281504 rows omitted)

In [17]:
profs = salary.select('first', 'last', 'title', 'gross').where('title', are.containing('PROF'))
profs
Out[17]:
first last title gross
ELIZABETH ABEL PROF-AY 138,775
NORMAN ABRAHAMSON ADJ PROF-AY-1/9-B/E/E 19,668
BARBARA ABRAMS PROF-AY 191,162
ILAN ADLER PROF-AY-B/E/E 166,617
VINOD AGGARWAL PROF-AY 167,525
ALICE AGOGINO PROF-AY-B/E/E 243,259
DAVID ALDOUS PROF-AY 218,666
RONELLE ALEXANDER PROF-AY 167,642
NEZAR ALSAYYAD PROF-AY 210,389
GENEVIEVE AMES ADJ PROF-AY 9,783

... (17564 rows omitted)

Look at the very last row of the output – that gross income doesn't look right.

In [18]:
profs.sort('gross', descending = True)
Out[18]:
first last title gross
STEVEN H APPLEBAUM HS ASSOC CLIN PROF-HCOMP 999,756
JOHN A GLASPY PROF-HCOMP 999,631
FRANK P.K. HSU PROF OF CLIN-HCOMP 998,340
JOHN STUART NELSON PROF-HCOMP 997,975
HANMIN LEE PROF OF CLIN-HCOMP 995,434
DENNIS J SLAMON PROF-HCOMP 991,973
BENJAMIN J ANSELL HS CLIN PROF-HCOMP 991,543
NICHOLAS C SAENZ HS CLIN PROF-HCOMP 991,463
JOSEPH F GRECO HS ASST CLIN PROF-HCOMP 991,458
OMRI Y. MARIAN ACT PROF-AY-LAW 99,997

... (17564 rows omitted)

It's because the entries in the 'gross' column are strings, not integers.

In [19]:
profs.column('gross').item(0)
Out[19]:
'138,775'

Your job is to fix that!

In [ ]:
def fix_income(income):
    return _____

fixed_income = profs.apply(_____, _____)

profs = profs.with_columns(
    'gross', _____
)
In [ ]:
 

Example: end-of-semester grading

In [21]:
grade_bins = {
    'A+': 97,
    'A': 92,
    'B+': 85,
    'B': 79,
    'C+': 74,
    'C': 68,
    'D+': 58,
    'D': 50,
    'F': 0
}
In [22]:
def pct_to_letter(pct):
    for letter in grade_bins.keys():
        if pct >= grade_bins[letter]:
            return letter
In [23]:
pct_to_letter(59)
Out[23]:
'D+'
In [24]:
pct_to_letter(98)
Out[24]:
'A+'
In [25]:
gradebook = Table().with_columns(
    'Name', np.array(['Carrera', 'Panamera', 'Taycan', 'Cayenne', 'Macan', 'Cayman', 'Boxster']),
    'Grading Option', np.array(['GRD', 'PNP', 'PNP', 'GRD', 'GRD', 'GRD', 'PNP']),
    'Score', np.array([98, 86, 67.5, 45, 82, 88, 71])
)
In [26]:
gradebook
Out[26]:
Name Grading Option Score
Carrera GRD 98
Panamera PNP 86
Taycan PNP 67.5
Cayenne GRD 45
Macan GRD 82
Cayman GRD 88
Boxster PNP 71
In [27]:
gradebook.apply(pct_to_letter, 'Score')
Out[27]:
array(['A+', 'B+', 'D+', 'F', 'B', 'B+', 'C'], dtype='<U2')

What if we want to factor in grading options?

In [28]:
def pct_to_letter_option(pct, option):
    # If the student is enrolled for a letter grade
    # call our function pct_to_letter
    if option == 'GRD':
        return pct_to_letter(pct)
    # Otherwise, check to see if they have at least a C-
    # (C here because our bins don't have a C-)
    else:
        if pct >= grade_bins['C']:
            return 'P'
        else:
            return 'NP'
In [29]:
gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')
Out[29]:
array(['A+', 'P', 'NP', 'F', 'B', 'B+', 'P'], dtype='<U2')
In [30]:
gradebook = gradebook.with_columns(
    'Letter Grade', gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')
)
In [31]:
gradebook
Out[31]:
Name Grading Option Score Letter Grade
Carrera GRD 98 A+
Panamera PNP 86 P
Taycan PNP 67.5 NP
Cayenne GRD 45 F
Macan GRD 82 B
Cayman GRD 88 B+
Boxster PNP 71 P

Masking

In [32]:
numbers = np.array([15, 14, -2, 1, 9])
In [33]:
numbers[[True, False, False, True, False]]
Out[33]:
array([15,  1])
In [34]:
gradebook
Out[34]:
Name Grading Option Score Letter Grade
Carrera GRD 98 A+
Panamera PNP 86 P
Taycan PNP 67.5 NP
Cayenne GRD 45 F
Macan GRD 82 B
Cayman GRD 88 B+
Boxster PNP 71 P
In [35]:
gradebook.where([True, False, False, True, True, False, False])
Out[35]:
Name Grading Option Score Letter Grade
Carrera GRD 98 A+
Cayenne GRD 45 F
Macan GRD 82 B

Example: countries

Run the following cell – ignore the lambda parts:

In [36]:
countries = Table.read_table('data/countries.csv')
countries = countries.relabeled('Country(or dependent territory)', 'Country') \
           .relabeled('% of world', '%') \
           .relabeled('Source(official or UN)', 'Source')
countries = countries.with_columns(
    'Country', countries.apply(lambda s: s[:s.index('[')].lower() if '[' in s else s.lower(), 'Country'),
    'Population', countries.apply(lambda i: int(i.replace(',', '')), 'Population'),
    '%', countries.apply(lambda f: float(f.replace('%', '')), '%')
)
In [37]:
countries
Out[37]:
Rank Country Population % Date Source
1 china 1405936040 17.9 27 Dec 2020 National population clock[3]
2 india 1371366679 17.5 27 Dec 2020 National population clock[4]
3 united states 330888778 4.22 27 Dec 2020 National population clock[5]
4 indonesia 269603400 3.44 1 Jul 2020 National annual projection[6]
5 pakistan 220892331 2.82 1 Jul 2020 UN Projection[2]
6 brazil 212523810 2.71 27 Dec 2020 National population clock[7]
7 nigeria 206139587 2.63 1 Jul 2020 UN Projection[2]
8 bangladesh 169885314 2.17 27 Dec 2020 National population clock[8]
9 russia 146748590 1.87 1 Jan 2020 National annual estimate[9]
10 mexico 127792286 1.63 1 Jul 2020 National annual projection[10]

... (232 rows omitted)

In [38]:
def starts_or_ends_with_a(name):
    return name[0] == 'a' or name[-1] == 'a'
In [39]:
countries.apply(starts_or_ends_with_a, 'Country')
Out[39]:
array([ True,  True, False,  True, False, False,  True, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False,  True,
        True,  True, False,  True,  True, False, False,  True, False,
       False,  True, False, False,  True,  True,  True, False,  True,
        True, False, False, False,  True, False, False,  True,  True,
       False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False,  True,  True, False, False,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True,  True,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False,  True, False, False, False,  True, False,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True, False,
       False, False, False,  True,  True, False, False,  True, False,
        True, False,  True, False,  True,  True, False, False, False,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])
In [40]:
countries.where(countries.apply(starts_or_ends_with_a, 'Country'))
Out[40]:
Rank Country Population % Date Source
1 china 1405936040 17.9 27 Dec 2020 National population clock[3]
2 india 1371366679 17.5 27 Dec 2020 National population clock[4]
4 indonesia 269603400 3.44 1 Jul 2020 National annual projection[6]
7 nigeria 206139587 2.63 1 Jul 2020 UN Projection[2]
9 russia 146748590 1.87 1 Jan 2020 National annual estimate[9]
13 ethiopia 109612120 1.4 1 Jul 2020 National annual projection[13]
23 south africa 59622350 0.761 1 Jul 2020 National annual estimate[23]
25 tanzania 57637628 0.735 1 Jul 2020 National annual projection[25]
27 south korea 51834302 0.661 1 Nov 2020 Monthly national estimate[27]
28 colombia 50372424 0.643 30 Jun 2020 National annual projection[28]

... (72 rows omitted)