Creating a dataframe
A dataframe is a 2d table-like structure, similar to a database table.
import pandas as pd
columns = ["product_id", "price"]
data = [
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
]
frame = pd.DataFrame(data, columns=columns)
print(frame)
# Output
#
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
Dataframe size
The shape
property of the DataFrame
object holds information about number of rows and columns in the dataframe.
import pandas as pd
columns = ["product_id", "price"]
data = [
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
]
frame = pd.DataFrame(data, columns=columns)
rows = frame.shape[0]
cols = frame.shape[1]
print("row: {0}, columns: {1}".format(rows, cols))
# row: 5, columns: 2
Slicing a dataframe
import pandas as pd
columns = ["product_id", "price"]
data = [
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
]
frame = pd.DataFrame(data, columns=columns)
## first two rows
frame[0:2]
# product_id price
# 0 1 50
# 1 2 55
## also first two rows
frame.head(2)
# product_id price
# 0 1 50
# 1 2 55
## last two rows
frame[-2:]
# product_id price
# 3 4 32
# 4 5 44
## also last two rows
frame.tail(2)
# product_id price
# 3 4 32
# 4 5 44
## from the second to before last
frame[1:-1]
# product_id price
# 1 2 55
# 2 3 30
# 3 4 32
Selecting/searching data
The property loc
allows to search and select data inside the frame.
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
],
columns=["product_id", "price"],
)
## Search for product_id == 2
frame.loc[frame["product_id"] == 2]
# product_id price
# 1 2 55
## Search for product_id == 2 and select the price
frame.loc[frame["product_id"] == 2, ["price"]]
# price
# 1 55
Creating a new column
This example creates a new column called “cost”.
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
],
columns=["product_id", "price"],
)
margin = 0.6
frame["cost"] = frame["price"] * (1 - margin)
print(frame)
# product_id price cost
# 0 1 50 20.0
# 1 2 55 22.0
# 2 3 30 12.0
# 3 4 32 12.8
# 4 5 44 17.6
Dropping duplicated rows
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, 32],
#🡓 Duplicated IDs
[5, 44], # <- this will be deleted
[5, 54], # <- let's keep only this last one
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
# 5 5 54
frame.drop_duplicates(subset="product_id", keep="last", inplace=True)
print(frame)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 5 5 54 # <-- kept
Dropping rows having null values
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, None], # <-- let's drop this one
[5, 44],
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50.0
# 1 2 55.0
# 2 3 30.0
# 3 4 NaN # <--
# 4 5 44.0
frame.dropna(subset="price", inplace=True)
print(frame)
# product_id price
# 0 1 50.0
# 1 2 55.0
# 2 3 30.0
# 4 5 44.0
Modifying a column
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
tax = 1.25
frame["price"] = frame["price"] * tax
print(frame)
# product_id price
# 0 1 62.50
# 1 2 68.75
# 2 3 37.50
# 3 4 40.00
# 4 5 55.00
Renaming columns
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
[4, 32],
[5, 44],
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
frame.rename(columns={"price": "cost"}, inplace=True)
print(frame)
# product_id cost
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
Changing column data-type
import pandas as pd
frame = pd.DataFrame(
[
[1, 50.1],
[2, 55.3],
[3, 30.2],
[4, 32.7],
[5, 44.6],
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50.1
# 1 2 55.3
# 2 3 30.2
# 3 4 32.7
# 4 5 44.6
# Convert price to integer
frame = frame.astype({"price": int})
print(frame)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 3 4 32
# 4 5 44
Filling missing data
import pandas as pd
frame = pd.DataFrame(
[
[1, 50],
[2, None],
[3, None],
[4, None],
[5, 44],
],
columns=["product_id", "price"],
)
print(frame)
# product_id price
# 0 1 50.0
# 1 2 NaN
# 2 3 NaN
# 3 4 NaN
# 4 5 44.0
frame["price"] = frame["price"].fillna(0)
print(frame)
# product_id price
# 0 1 50.0
# 1 2 0.0
# 2 3 0.0
# 3 4 0.0
# 4 5 44.0
Concatenating dataframes
import pandas as pd
columns = ["product_id", "price"]
frame1 = pd.DataFrame(
[
[1, 50],
[2, 55],
[3, 30],
],
columns=columns,
)
print(frame1)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
frame2 = pd.DataFrame(
[
[4, 32],
[5, 44],
[6, 31],
],
columns=columns,
)
print(frame2)
# product_id price
# 0 4 32
# 1 5 44
# 2 6 31
frame3 = pd.concat([frame1, frame2])
print(frame3)
# product_id price
# 0 1 50
# 1 2 55
# 2 3 30
# 0 4 32
# 1 5 44
# 2 6 31
Pivot tables
import pandas as pd
frame = pd.DataFrame(
[
["Fabio", "Literature", 3],
["Fabio", "History", 8],
["Fabio", "Geography", 8],
["João", "Literature", 9],
["João", "History", 7],
["João", "Geography", 7],
["Pedro", "Literature", 7],
["Pedro", "History", 9],
["Pedro", "Geography", 7],
],
columns=["student", "exam", "score"]
)
print(frame)
# student exam score
# 0 Fabio Literature 3
# 1 Fabio History 8
# 2 Fabio Geography 8
# 3 João Literature 9
# 4 João History 7
# 5 João Geography 7
# 6 Pedro Literature 7
# 7 Pedro History 9
# 8 Pedro Geography 7
pivoted_frame = frame.pivot(index="exam", columns="student", values="score")
print(pivoted_frame)
# student Fabio João Pedro
# exam
# Geography 8 7 7
# History 8 7 9
# Literature 3 9 7
Melt dataframe (undo pivot)
import pandas as pd
frame = pd.DataFrame(
[
["Geography", 8, 7, 7],
["History", 8, 7, 9],
["Literature", 3, 9, 7],
],
columns=["exam", "Fabio", "João", "Pedro"]
)
print(frame)
# exam Fabio João Pedro
# 0 Geography 8 7 7
# 1 History 8 7 9
# 2 Literature 3 9 7
melted_frame = frame.melt(
id_vars=["exam"],
value_vars=["Fabio", "João", "Pedro"],
var_name="student",
value_name="score",
)
print(melted_frame)
# exam student score
# 0 Geography Fabio 8
# 1 History Fabio 8
# 2 Literature Fabio 3
# 3 Geography João 7
# 4 History João 7
# 5 Literature João 9
# 6 Geography Pedro 7
# 7 History Pedro 9
# 8 Literature Pedro 7
Chaining methods
An exampe of search and sorting.
import pandas as pd
frame = pd.DataFrame(
[
["Fabio", 6],
["Joao", 8],
["Pedro", 9],
["Aline", 10],
["Ricardo", 7],
["Tina", 8],
["Julio", 7],
["Rogério", 6],
["Maria", 9],
],
columns=["student", "score"]
)
print(frame)
# student score
# 0 Fabio 6
# 1 Joao 8
# 2 Pedro 9
# 3 Aline 10
# 4 Ricardo 7
# 5 Tina 8
# 6 Julio 7
# 7 Rogério 6
# 8 Maria 9
MIN_SCORE = 7
approved = (
frame[frame["score"] >= MIN_SCORE]
.sort_values(by="score", ascending=False)
)
print(approved)
# student score
# 3 Aline 10
# 2 Pedro 9
# 8 Maria 9
# 1 Joao 8
# 5 Tina 8
# 4 Ricardo 7
# 6 Julio 7