# Element-wise sum of two plain Python lists with an explicit loop.
list1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
list2 = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

summed = []
for i in range(len(list1)):
    summed.append(list1[i] + list2[i])
summed
[9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
Fast computation using vectors and matrices
# Element-wise sum of two plain Python lists with an explicit loop.
list1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
list2 = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

summed = []
for i in range(len(list1)):
    summed.append(list1[i] + list2[i])
summed
[9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
import numpy as np
# Wrap the two lists in NumPy arrays.
a = np.array(list1)
b = np.array(list2)
a, b
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
a + b  # element-wise sum, no explicit loop
array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])
a * b  # element-wise product
array([ 0, 8, 14, 18, 20, 20, 18, 14, 8, 0])
a - 10  # the scalar is subtracted from every element
array([-10, -9, -8, -7, -6, -5, -4, -3, -2, -1])
a.sum()
45
a.mean()
4.5
# A 3x3 table represented as nested Python lists.
list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list_of_lists
[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list_of_lists[1][1]  # second row, then second element of that row
5
# Nested lists become a 2-D NumPy array.
matrix = np.array(list_of_lists)
matrix
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
matrix[1][1]  # not efficient
5
matrix[1, 1]  # efficient
5
matrix - 10  # the scalar is subtracted from every element
array([[-9, -8, -7],
[-6, -5, -4],
[-3, -2, -1]])
matrix.sum()
45
# Two stacked 3x3 tables as triply nested Python lists.
list_of_lists_of_lists = [[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]]
list_of_lists_of_lists
[[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]]
# Triply nested lists become a 3-D NumPy array.
tensor = np.array(list_of_lists_of_lists)
tensor
array([[[1, 2, 3],
[4, 5, 6],
[7, 8, 9]],
[[1, 2, 3],
[4, 5, 6],
[7, 8, 9]]])
tensor[1, 1, 1]
5
Fast computations on data tables (on top of Numpy).
import pandas as pd
# Build a DataFrame from a dict that maps column names to column values.
df = pd.DataFrame({'name': ['Mike', 'Mia', 'Jake'], 'weight': [82, 62, 75]})
df
name | weight | |
---|---|---|
0 | Mike | 82 |
1 | Mia | 62 |
2 | Jake | 75 |
type(df)
pandas.core.frame.DataFrame
# Same table, with the column mapping written using dict(...) keyword arguments.
df = pd.DataFrame(dict(name=['Mike', 'Mia', 'Jake'], weight=[82, 62, 75]))
df
name | weight | |
---|---|---|
0 | Mike | 82 |
1 | Mia | 62 |
2 | Jake | 75 |
# Build a DataFrame from a list of row tuples.
records = [('Mike', 82), ('Mia', 62), ('Jake', 75)]

# from_records is a classmethod, so it is called on the class itself rather
# than on a throwaway empty DataFrame instance.
# NOTE(review): the first field of each record is a name, yet the column is
# labelled 'age' -- probably meant to be 'name'. Kept as-is so that the
# outputs shown after this cell still match.
df = pd.DataFrame.from_records(records, columns=['age', 'weight'])
df
age | weight | |
---|---|---|
0 | Mike | 82 |
1 | Mia | 62 |
2 | Jake | 75 |
df.index
RangeIndex(start=0, stop=3, step=1)
df.index.values
array([0, 1, 2])
df.columns
Index(['age', 'weight'], dtype='object')
df.dtypes
age object
weight int64
dtype: object
Add a column to an existing dataframe:
df['height'] = [182.5, 173.0, 192.5]
df
age | weight | height | |
---|---|---|---|
0 | Mike | 82 | 182.5 |
1 | Mia | 62 | 173.0 |
2 | Jake | 75 | 192.5 |
Add another, categorical, column:
df['sex'] = pd.Categorical(['male', 'female', 'male'], categories=['female', 'male'], ordered=True)
df
age | weight | height | sex | |
---|---|---|---|---|
0 | Mike | 82 | 182.5 | male |
1 | Mia | 62 | 173.0 | female |
2 | Jake | 75 | 192.5 | male |
df.dtypes
age object
weight int64
height float64
sex category
dtype: object
A Series just wraps an array:
df.height.to_numpy()
array([182.5, 173. , 192.5])
import seaborn as sns
# Load the example "penguins" dataset through seaborn.
penguins = sns.load_dataset('penguins')
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
penguins.dtypes
species object
island object
bill_length_mm float64
bill_depth_mm float64
flipper_length_mm float64
body_mass_g float64
sex object
dtype: object
penguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
penguins.tail()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
penguins['flipper_length_mm']
0 181.0
1 186.0
2 195.0
3 NaN
4 193.0
...
339 NaN
340 215.0
341 222.0
342 212.0
343 213.0
Name: flipper_length_mm, Length: 344, dtype: float64
penguins.flipper_length_mm
0 181.0
1 186.0
2 195.0
3 NaN
4 193.0
...
339 NaN
340 215.0
341 222.0
342 212.0
343 213.0
Name: flipper_length_mm, Length: 344, dtype: float64
type(penguins.flipper_length_mm)
pandas.core.series.Series
penguins.bill_depth_mm - 1000
0 -981.3
1 -982.6
2 -982.0
3 NaN
4 -980.7
...
339 NaN
340 -985.7
341 -984.3
342 -985.2
343 -983.9
Name: bill_depth_mm, Length: 344, dtype: float64
penguins.bill_depth_mm * penguins.flipper_length_mm
0 3384.7
1 3236.4
2 3510.0
3 NaN
4 3724.9
...
339 NaN
340 3074.5
341 3485.4
342 3137.6
343 3429.3
Length: 344, dtype: float64
penguins.loc[4, 'island']
'Torgersen'
penguins.loc[4]
species Adelie
island Torgersen
bill_length_mm 36.7
bill_depth_mm 19.3
flipper_length_mm 193.0
body_mass_g 3450.0
sex Female
Name: 4, dtype: object
penguins['bill_depth_mm']
0 18.7
1 17.4
2 18.0
3 NaN
4 19.3
...
339 NaN
340 14.3
341 15.7
342 14.8
343 16.1
Name: bill_depth_mm, Length: 344, dtype: float64
penguins.bill_depth_mm
0 18.7
1 17.4
2 18.0
3 NaN
4 19.3
...
339 NaN
340 14.3
341 15.7
342 14.8
343 16.1
Name: bill_depth_mm, Length: 344, dtype: float64
penguins.loc[40:45, ['island', 'body_mass_g']]  # the label slice includes row 45
island | body_mass_g | |
---|---|---|
40 | Dream | 3150.0 |
41 | Dream | 3900.0 |
42 | Dream | 3100.0 |
43 | Dream | 4400.0 |
44 | Dream | 3000.0 |
45 | Dream | 4600.0 |
# Boolean mask: True for rows whose bill is longer than 55 mm.
idx = penguins.bill_length_mm > 55
idx
0 False
1 False
2 False
3 False
4 False
...
339 False
340 False
341 False
342 False
343 False
Name: bill_length_mm, Length: 344, dtype: bool
penguins.loc[idx]
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
169 | Chinstrap | Dream | 58.0 | 17.8 | 181.0 | 3700.0 | Female |
215 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | Male |
253 | Gentoo | Biscoe | 59.6 | 17.0 | 230.0 | 6050.0 | Male |
321 | Gentoo | Biscoe | 55.9 | 17.0 | 228.0 | 5600.0 | Male |
335 | Gentoo | Biscoe | 55.1 | 16.0 | 230.0 | 5850.0 | Male |
penguins.loc[(penguins.bill_length_mm > 55) & (penguins.sex == 'Female')]
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
169 | Chinstrap | Dream | 58.0 | 17.8 | 181.0 | 3700.0 | Female |
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
# Move three columns into a hierarchical row index.
df = penguins.set_index(['species', 'sex', 'island'])
df.head(10)
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |||
---|---|---|---|---|---|---|
species | sex | island | ||||
Adelie | Male | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 |
Female | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | |
Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | ||
NaN | Torgersen | NaN | NaN | NaN | NaN | |
Female | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | |
Male | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | |
Female | Torgersen | 38.9 | 17.8 | 181.0 | 3625.0 | |
Male | Torgersen | 39.2 | 19.6 | 195.0 | 4675.0 | |
NaN | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | |
Torgersen | 42.0 | 20.2 | 190.0 | 4250.0 |
df.reset_index()
species | sex | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|---|---|
0 | Adelie | Male | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 |
1 | Adelie | Female | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 |
2 | Adelie | Female | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 |
3 | Adelie | NaN | Torgersen | NaN | NaN | NaN | NaN |
4 | Adelie | Female | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 |
... | ... | ... | ... | ... | ... | ... | ... |
339 | Gentoo | NaN | Biscoe | NaN | NaN | NaN | NaN |
340 | Gentoo | Female | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 |
341 | Gentoo | Male | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 |
342 | Gentoo | Female | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 |
343 | Gentoo | Male | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 |
344 rows × 7 columns
# Sort rows by bill length; each row keeps its original index label.
sorted_df = penguins.sort_values(by="bill_length_mm")
sorted_df.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
142 | Adelie | Dream | 32.1 | 15.5 | 188.0 | 3050.0 | Female |
98 | Adelie | Dream | 33.1 | 16.1 | 178.0 | 2900.0 | Female |
70 | Adelie | Torgersen | 33.5 | 19.0 | 190.0 | 3600.0 | Female |
92 | Adelie | Dream | 34.0 | 17.1 | 185.0 | 3400.0 | Female |
8 | Adelie | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | NaN |
sorted_df.index.values
array([142, 98, 70, 92, 8, 18, 54, 80, 14, 100, 52, 83, 124,
25, 66, 74, 136, 60, 90, 118, 68, 22, 42, 48, 150, 148,
78, 94, 120, 86, 34, 64, 58, 40, 15, 147, 4, 82, 132,
87, 44, 138, 77, 31, 144, 117, 84, 47, 133, 62, 38, 59,
21, 121, 102, 103, 10, 20, 11, 149, 104, 28, 96, 108, 134,
110, 107, 23, 88, 130, 13, 106, 116, 16, 24, 126, 36, 89,
6, 128, 145, 56, 0, 35, 146, 7, 5, 30, 1, 32, 114,
45, 50, 72, 93, 112, 105, 139, 71, 39, 51, 137, 140, 122,
97, 2, 27, 29, 125, 141, 26, 57, 143, 95, 41, 230, 182,
33, 76, 101, 135, 119, 46, 63, 91, 67, 12, 61, 85, 123,
55, 127, 151, 65, 326, 69, 236, 53, 9, 79, 113, 37, 49,
172, 184, 206, 17, 256, 115, 260, 75, 251, 81, 244, 131, 278,
109, 174, 99, 228, 328, 306, 216, 332, 288, 265, 276, 258, 129,
43, 257, 246, 336, 314, 268, 304, 275, 241, 252, 272, 208, 298,
299, 269, 342, 157, 280, 262, 226, 155, 232, 277, 195, 312, 266,
111, 211, 214, 204, 282, 73, 284, 234, 166, 160, 19, 158, 245,
220, 286, 238, 334, 281, 193, 243, 170, 180, 291, 294, 293, 225,
274, 152, 242, 162, 270, 227, 176, 325, 229, 340, 213, 317, 190,
164, 338, 322, 324, 250, 302, 310, 296, 187, 308, 188, 224, 290,
247, 329, 202, 248, 292, 318, 233, 255, 271, 173, 320, 295, 259,
239, 222, 337, 192, 199, 231, 300, 323, 254, 171, 237, 235, 209,
316, 313, 179, 287, 263, 261, 217, 186, 331, 201, 285, 343, 303,
153, 221, 223, 249, 198, 273, 210, 219, 240, 168, 279, 341, 267,
330, 167, 178, 264, 175, 289, 205, 305, 218, 197, 315, 196, 194,
185, 297, 319, 154, 159, 161, 307, 203, 333, 200, 163, 212, 165,
177, 189, 309, 207, 311, 301, 156, 181, 327, 191, 183, 283, 335,
215, 321, 169, 253, 3, 339])
Click to the left of an output cell to enable/disable scrolling of the output (useful for large amounts of output).
sorted_df.loc[0]  # label-based: the row whose index label is 0
species Adelie
island Torgersen
bill_length_mm 39.1
bill_depth_mm 18.7
flipper_length_mm 181.0
body_mass_g 3750.0
sex Male
Name: 0, dtype: object
sorted_df.flipper_length_mm[0]
181.0
sorted_df.iloc[0]  # iloc !!!
species Adelie
island Dream
bill_length_mm 32.1
bill_depth_mm 15.5
flipper_length_mm 188.0
body_mass_g 3050.0
sex Female
Name: 142, dtype: object
sorted_df.flipper_length_mm.iloc[0]
188.0
penguins.describe()
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
count | 342.000000 | 342.000000 | 342.000000 | 342.000000 |
mean | 43.921930 | 17.151170 | 200.915205 | 4201.754386 |
std | 5.459584 | 1.974793 | 14.061714 | 801.954536 |
min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 |
25% | 39.225000 | 15.600000 | 190.000000 | 3550.000000 |
50% | 44.450000 | 17.300000 | 197.000000 | 4050.000000 |
75% | 48.500000 | 18.700000 | 213.000000 | 4750.000000 |
max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 |
penguins.bill_length_mm.mean()
43.9219298245614
penguins.bill_length_mm.count()
342
penguins.groupby('island')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x15cc9e5e0>
Aggregating produces a single value for each variable in each group:
Means for all numeric variables for each island:
penguins.groupby('island').aggregate("mean", numeric_only=True)
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
island | ||||
Biscoe | 45.257485 | 15.874850 | 209.706587 | 4716.017964 |
Dream | 44.167742 | 18.344355 | 193.072581 | 3712.903226 |
Torgersen | 38.950980 | 18.429412 | 191.196078 | 3706.372549 |
penguins.groupby('island').mean(numeric_only=True)
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
island | ||||
Biscoe | 45.257485 | 15.874850 | 209.706587 | 4716.017964 |
Dream | 44.167742 | 18.344355 | 193.072581 | 3712.903226 |
Torgersen | 38.950980 | 18.429412 | 191.196078 | 3706.372549 |
Means for `bill_length_mm` and `flipper_length_mm`:
penguins.groupby('island')[['bill_length_mm', 'flipper_length_mm']].mean()
bill_length_mm | flipper_length_mm | |
---|---|---|
island | ||
Biscoe | 45.257485 | 209.706587 |
Dream | 44.167742 | 193.072581 |
Torgersen | 38.950980 | 191.196078 |
Just for `flipper_length_mm`:
penguins.groupby('island').flipper_length_mm.mean()
island
Biscoe 209.706587
Dream 193.072581
Torgersen 191.196078
Name: flipper_length_mm, dtype: float64
Transforming produces new columns with the same length as the input:
penguins.groupby('island')[['bill_length_mm', 'flipper_length_mm']].transform("mean")
bill_length_mm | flipper_length_mm | |
---|---|---|
0 | 38.950980 | 191.196078 |
1 | 38.950980 | 191.196078 |
2 | 38.950980 | 191.196078 |
3 | 38.950980 | 191.196078 |
4 | 38.950980 | 191.196078 |
... | ... | ... |
339 | 45.257485 | 209.706587 |
340 | 45.257485 | 209.706587 |
341 | 45.257485 | 209.706587 |
342 | 45.257485 | 209.706587 |
343 | 45.257485 | 209.706587 |
344 rows × 2 columns
def z_value(sr):
    """Standardise ``sr``: subtract its mean and divide by its standard deviation.

    Args:
        sr: An object that supports ``.mean()``, ``.std()`` and element-wise
            arithmetic. It is used below on pandas Series; for those,
            ``.std()`` defaults to the sample standard deviation (``ddof=1``).

    Returns:
        An object of the same length as ``sr`` holding the z-scores.
    """
    return (sr - sr.mean()) / sr.std()
penguins.groupby('island')[['bill_length_mm', 'flipper_length_mm']].transform(z_value)
bill_length_mm | flipper_length_mm | |
---|---|---|
0 | 0.049258 | -1.636022 |
1 | 0.181475 | -0.833742 |
2 | 0.445910 | 0.610362 |
3 | NaN | NaN |
4 | -0.744048 | 0.289450 |
... | ... | ... |
339 | NaN | NaN |
340 | 0.323193 | 0.374297 |
341 | 1.077478 | 0.869267 |
342 | -0.012044 | 0.162167 |
343 | 0.972717 | 0.232877 |
344 rows × 2 columns
Flexible method allowing any operation on grouped data.
Return a single value:
def fun(df):
    """Return ``bill_length_mm + mean(flipper_length_mm) / body_mass_g`` per row.

    Args:
        df: A table exposing ``bill_length_mm``, ``flipper_length_mm`` and
            ``body_mass_g`` columns as attributes.

    Returns:
        One value per row of ``df``. Only the flipper length is averaged, and
        the division binds tighter than the addition.

    NOTE(review): because the result has one entry per row, this does not
    collapse a group to a single value. If one statistic per group was
    intended, the expression needs to be reduced further -- confirm intent.
    """
    mean_flipper = df.flipper_length_mm.mean()
    # Parentheses spell out the operator precedence of the original expression.
    return df.bill_length_mm + (mean_flipper / df.body_mass_g)
penguins.groupby('island').apply(fun)  # .to_frame('my_stat')
island
Biscoe 20 37.861678
21 37.758252
22 35.955186
23 38.253090
24 38.855186
...
Torgersen 127 41.544464
128 39.062687
129 44.147799
130 38.557503
131 43.154627
Length: 344, dtype: float64
Return a dataframe:
def fun(df):
    """Return a two-column DataFrame derived from ``df.bill_length_mm``.

    Args:
        df: A table exposing a ``bill_length_mm`` column as an attribute.

    Returns:
        A DataFrame with columns ``sqrt_bill`` (element-wise square root of
        the bill length) and ``bill_squared`` (element-wise square).
    """
    bill = df.bill_length_mm
    return pd.DataFrame({
        'sqrt_bill': np.sqrt(bill),
        'bill_squared': bill**2,
    })
penguins.groupby('island').apply(fun)
sqrt_bill | bill_squared | ||
---|---|---|---|
island | |||
Biscoe | 20 | 6.148170 | 1428.84 |
21 | 6.140033 | 1421.29 | |
22 | 5.991661 | 1288.81 | |
23 | 6.180615 | 1459.24 | |
24 | 6.228965 | 1505.44 | |
... | ... | ... | ... |
Torgersen | 127 | 6.442049 | 1722.25 |
128 | 6.244998 | 1521.00 | |
129 | 6.640783 | 1944.81 | |
130 | 6.204837 | 1482.25 | |
131 | 6.565059 | 1857.61 |
344 rows × 2 columns