Lecture 19 – Applying¶

Data 94, Spring 2021¶

from datascience import *
import numpy as np

Motivation¶

pups = Table.read_table('data/pups.csv')

pups

pups.with_columns(
    'human years', pups.column('age') * 7
)

Apply¶

def seven_times(x):
    return 7 * x

pups.apply(seven_times, 'age')

array([77, 49, 21, 28, 14])

Note, we wouldn't actually use the above example since we could just write pups.column('age') * 7.

Here's a more useful example:

def email_from_name(name):
    first, last = name.split(' ')
    email = first + '.' + last + '@dogschool.edu'
    return email.lower()

# Can use email_from_name on a single argument
email_from_name('Champ Major')

'champ.major@dogschool.edu'

pups.apply(email_from_name, 'name')

array(['junior.smith@dogschool.edu', 'rex.rogers@dogschool.edu',
       'flash.heat@dogschool.edu', 'reese.bo@dogschool.edu',
       'polo.cash@dogschool.edu'], dtype='<U26')

pups.with_columns('email', pups.apply(email_from_name, 'name'))

# Note, the parameter names don't
# need to be 'age' and 'size'
def human_years_converter(years_old, kind):
    if kind == 'small':
        return years_old * 6
    elif kind == 'medium':
        return years_old * 7
    else:
        return years_old * 8

human_years_converter(11, 'medium')

77

human_years_converter(11, 'small')

66

pups.apply(human_years_converter, 'age', 'size')

array([77, 56, 24, 28, 12])

pups.with_columns('accurate human years', pups.apply(human_years_converter, 'age', 'size'))

Quick Check 1¶

# Large file – this may take ~10 seconds to load
salary = Table.read_table('https://media.githubusercontent.com/media/dailycal-projects/ucb-faculty-salary/master/data/salary/salary_2015.csv')
salary

profs = salary.select('first', 'last', 'title', 'gross').where('title', are.containing('PROF'))
profs

Look at the very last row of the output – that gross income doesn't look right.

profs.sort('gross', descending = True)

It's because the entries in the 'gross' column are strings, not integers.

profs.column('gross').item(0)

'138,775'

Your job is to fix that!

def fix_income(income):
    return _____

fixed_income = profs.apply(_____, _____)

profs = profs.with_columns(
    'gross', _____
)

Example: end-of-semester grading¶

grade_bins = {
    'A+': 97,
    'A': 92,
    'B+': 85,
    'B': 79,
    'C+': 74,
    'C': 68,
    'D+': 58,
    'D': 50,
    'F': 0
}

def pct_to_letter(pct):
    for letter in grade_bins.keys():
        if pct >= grade_bins[letter]:
            return letter

pct_to_letter(59)

'D+'

pct_to_letter(98)

'A+'

gradebook = Table().with_columns(
    'Name', np.array(['Carrera', 'Panamera', 'Taycan', 'Cayenne', 'Macan', 'Cayman', 'Boxster']),
    'Grading Option', np.array(['GRD', 'PNP', 'PNP', 'GRD', 'GRD', 'GRD', 'PNP']),
    'Score', np.array([98, 86, 67.5, 45, 82, 88, 71])
)

gradebook

gradebook.apply(pct_to_letter, 'Score')

array(['A+', 'B+', 'D+', 'F', 'B', 'B+', 'C'], dtype='<U2')

What if we want to factor in grading options?

def pct_to_letter_option(pct, option):
    # If the student is enrolled for a letter grade
    # call our function pct_to_letter
    if option == 'GRD':
        return pct_to_letter(pct)
    # Otherwise, check to see if they have at least a C-
    # (C here because our bins don't have a C-)
    else:
        if pct >= grade_bins['C']:
            return 'P'
        else:
            return 'NP'

gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')

array(['A+', 'P', 'NP', 'F', 'B', 'B+', 'P'], dtype='<U2')

gradebook = gradebook.with_columns(
    'Letter Grade', gradebook.apply(pct_to_letter_option, 'Score', 'Grading Option')
)

gradebook

Masking¶

numbers = np.array([15, 14, -2, 1, 9])

numbers[[True, False, False, True, False]]

array([15,  1])

gradebook

gradebook.where([True, False, False, True, True, False, False])

Example: countries¶

Run the following cell – ignore the lambda parts:

countries = Table.read_table('data/countries.csv')
countries = countries.relabeled('Country(or dependent territory)', 'Country') \
           .relabeled('% of world', '%') \
           .relabeled('Source(official or UN)', 'Source')
countries = countries.with_columns(
    'Country', countries.apply(lambda s: s[:s.index('[')].lower() if '[' in s else s.lower(), 'Country'),
    'Population', countries.apply(lambda i: int(i.replace(',', '')), 'Population'),
    '%', countries.apply(lambda f: float(f.replace('%', '')), '%')
)

countries

def starts_or_ends_with_a(name):
    return name[0] == 'a' or name[-1] == 'a'

countries.apply(starts_or_ends_with_a, 'Country')

array([ True,  True, False,  True, False, False,  True, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False,  True,
        True,  True, False,  True,  True, False, False,  True, False,
       False,  True, False, False,  True,  True,  True, False,  True,
        True, False, False, False,  True, False, False,  True,  True,
       False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False,  True,  True, False, False,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True,  True,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False,  True, False, False, False,  True, False,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True, False,
       False, False, False,  True,  True, False, False,  True, False,
        True, False,  True, False,  True,  True, False, False, False,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

countries.where(countries.apply(starts_or_ends_with_a, 'Country'))

year	location	first	last	title	gross	regular	other
2015	Berkeley	ANNE	AABOE	BUS SYS ANL 4	124,454	124,454	0
2015	Berkeley	DAVID	AAKER	RECALL FACULTY	2,500	0	2,500
2015	Berkeley	ELIZABETH	ABEL	PROF-AY	138,775	138,775	0
2015	Berkeley	NORMAN	ABRAHAMSON	ADJ PROF-AY-1/9-B/E/E	19,668	19,668	0
2015	Berkeley	BARBARA	ABRAMS	PROF-AY	191,162	169,862	21,300
2015	Berkeley	JOHN	ACZON	FINANCIAL SVC ANL 3	83,510	78,510	5,000
2015	Berkeley	ANTHONY	ADAMS	RES-FY	9,587	9,587	0
2015	Berkeley	PENNY	HINES	ACAD HR ANL 5	126,707	126,707	0
2015	Berkeley	ANINDITA	ADHIKARI	SR LECT SOE-AY	107,345	107,345	0
2015	Berkeley	ILAN	ADLER	PROF-AY-B/E/E	166,617	151,617	15,000

first	last	title	gross
ELIZABETH	ABEL	PROF-AY	138,775
NORMAN	ABRAHAMSON	ADJ PROF-AY-1/9-B/E/E	19,668
BARBARA	ABRAMS	PROF-AY	191,162
ILAN	ADLER	PROF-AY-B/E/E	166,617
VINOD	AGGARWAL	PROF-AY	167,525
ALICE	AGOGINO	PROF-AY-B/E/E	243,259
DAVID	ALDOUS	PROF-AY	218,666
RONELLE	ALEXANDER	PROF-AY	167,642
NEZAR	ALSAYYAD	PROF-AY	210,389
GENEVIEVE	AMES	ADJ PROF-AY	9,783

first	last	title	gross
STEVEN H	APPLEBAUM	HS ASSOC CLIN PROF-HCOMP	999,756
JOHN A	GLASPY	PROF-HCOMP	999,631
FRANK P.K.	HSU	PROF OF CLIN-HCOMP	998,340
JOHN STUART	NELSON	PROF-HCOMP	997,975
HANMIN	LEE	PROF OF CLIN-HCOMP	995,434
DENNIS J	SLAMON	PROF-HCOMP	991,973
BENJAMIN J	ANSELL	HS CLIN PROF-HCOMP	991,543
NICHOLAS C	SAENZ	HS CLIN PROF-HCOMP	991,463
JOSEPH F	GRECO	HS ASST CLIN PROF-HCOMP	991,458
OMRI Y.	MARIAN	ACT PROF-AY-LAW	99,997

Rank	Country	Population	%	Date	Source
1	china	1405936040	17.9	27 Dec 2020	National population clock[3]
2	india	1371366679	17.5	27 Dec 2020	National population clock[4]
3	united states	330888778	4.22	27 Dec 2020	National population clock[5]
4	indonesia	269603400	3.44	1 Jul 2020	National annual projection[6]
5	pakistan	220892331	2.82	1 Jul 2020	UN Projection[2]
6	brazil	212523810	2.71	27 Dec 2020	National population clock[7]
7	nigeria	206139587	2.63	1 Jul 2020	UN Projection[2]
8	bangladesh	169885314	2.17	27 Dec 2020	National population clock[8]
9	russia	146748590	1.87	1 Jan 2020	National annual estimate[9]
10	mexico	127792286	1.63	1 Jul 2020	National annual projection[10]

Rank	Country	Population	%	Date	Source
1	china	1405936040	17.9	27 Dec 2020	National population clock[3]
2	india	1371366679	17.5	27 Dec 2020	National population clock[4]
4	indonesia	269603400	3.44	1 Jul 2020	National annual projection[6]
7	nigeria	206139587	2.63	1 Jul 2020	UN Projection[2]
9	russia	146748590	1.87	1 Jan 2020	National annual estimate[9]
13	ethiopia	109612120	1.4	1 Jul 2020	National annual projection[13]
23	south africa	59622350	0.761	1 Jul 2020	National annual estimate[23]
25	tanzania	57637628	0.735	1 Jul 2020	National annual projection[25]
27	south korea	51834302	0.661	1 Nov 2020	Monthly national estimate[27]
28	colombia	50372424	0.643	30 Jun 2020	National annual projection[28]

name	age	size
Junior Smith	11	medium
Rex Rogers	7	big
Flash Heat	3	big
Reese Bo	4	medium
Polo Cash	2	small

name	age	size	email
Junior Smith	11	medium	junior.smith@dogschool.edu
Rex Rogers	7	big	rex.rogers@dogschool.edu
Flash Heat	3	big	flash.heat@dogschool.edu
Reese Bo	4	medium	reese.bo@dogschool.edu
Polo Cash	2	small	polo.cash@dogschool.edu

Name	Grading Option	Score
Carrera	GRD	98
Panamera	PNP	86
Taycan	PNP	67.5
Cayenne	GRD	45
Macan	GRD	82
Cayman	GRD	88
Boxster	PNP	71