Notes from Getting Started Tutorial:
https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
- Create a Pandas Series and DataFrame
- Create Graphs with Matplotlib
- Adding new Columns from Existing Columns (inc. split)
- Summary Statistics including groupby
- Reshape layout of tables incl. Sorting and Pivot tables
- Sub-sets and filtering
- Combining multiple tables data (Concatonate (union) and merge (Join)
- Working with timeseries (pd.to_datetime(), dt, DatetimeIndex and Resample)
- Textual data (.str and .replace and split)
1. Creating a Dataframe and Series
import pandas as pd
#Create a series
books= pd.Series(["Papillion", "Great Expectations", "Magic"], name="Books") #Create a series
#Create DataFrame with Key value pairs. In this case the values are lists of values.
df = pd.DataFrame(
{
"Name": [
"Humpfries, Mr. Bill",
"Derby, Mrs. Sheila",
"Winfrey, Miss. Elizabeth",
],
"Age": [44, 66, 42],
"Gender": ["male", "female", "female"],
"City": ["London", "Manchester", "Leeds"],
"Occupation": ["Doctor", "Architect", "Analyst"]
}
)
#Using the list
df["Age"] #Output Age Column
#df["Age"].max() #outputs max age
2. Create Plots with Matplotlib
import pandas as pd
import matplotlib.pyplot as plt
#Create Pandas dataframe from csv file
air_quality = pd.read_csv("air_quality_no2.csv", index_col=0, parse_dates=True)
air_quality["station_paris"].plot() #craate plot of subset from DataFrame
plt.show()
#Plot with Matplot lib
air_quality.plot.scatter(x="station_london", y="station_paris", alpha=0.5)
plt.show()
#Create scatter from DataFrame
air_quality.plot.box() #Create box plot
plt.show() #Display boxplot
#subplots
fig, axs = air_quality.plot.area(figsize=(10, 4), subplots=True)
plt.show()
3. Add new columns from existing columns
Import pandas as pd
#index_col=0, sets first column to index, parse_dates converts dates to date type
air_quality = pd.read_csv("air_quality_no2.csv", index_col=0, parse_dates=True)
#Add new column based on existing column - this adds column to DataFrame
air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882
#Another example, this time a %
air_quality["ratio_paris_antwerp"] = (air_quality["station_paris"] / air_quality["station_antwerp"]
#renaming columns
air_quality_renamed = air_quality.rename(columns={"station_antwerp": "BETR801","station_paris": "FR04014", "station_london": "London Westminster"})
4. Summary Statistics
import pandas as pd
#read_csv function - also read_ excel, sql, json, parquet and more
titanic = pd.read_csv("titanic.csv") #read_csv function reads from csv
titanic["Age"].mean() #average of Age column
titanic[["Age", "Fare"]].median() #median of Age and Fare Coluns
titanic[["Age", "Fare"]].describe() #summary stats of 2 columns
titanic.agg({"Age": ["min", "max", "median", "skew"],"Fare": ["min", "max", "median", "mean"]})
#multiple stats on Age and Fare columns
#groupby to create summary tables
titanic[["Sex", "Age"]].groupby("Sex").mean()
#Average age by sex
titanic.groupby("Sex").mean(numeric_only=True)
#average of all numberics by Sex
titanic.groupby(["Sex", "Pclass"])["Fare"].mean() #table average fare by sex and PClass
titanic["Pclass"].value_counts() #count passengers in Pclass
titanic.groupby("Pclass")["Pclass"].count() #count longer method
5. Re-shape layout of tables
import pandas as pd
titanic = pd.read_csv("titanic.csv") #read_csv function reads from csv
air_quality = pd.read_csv("air_quality_long.csv", index_col="date.utc", parse_dates=True)
#Sorting Tables
sort1 = titanic.sort_values(by="Age").head()
##sort by Age
sort2 = titanic.sort_values(by=['Pclass', 'Age'], ascending=False) #sort by Pclas then Age descending
# filter for no2 data only
no2 = air_quality[air_quality["parameter"] == "no2"]
# filter for no2 data only<br>no2_subset = no2.sort_index().groupby(["location"]).head(2)
no2_subset
#pivot table wide format
air_quality.pivot_table(values="value", index="location", columns="parameter", aggfunc="mean")
air_quality.pivot_table(values="value",index="location",columns="parameter",aggfunc="mean",margins=True)
#wide to long format
no2_pivoted = no2.pivot(columns="location", values="value").reset_index()
no2_pivoted.head(5)
#pandas.melt() method converts pivot table to long format
no_2 = no2_pivoted.melt(id_vars="date.utc")<br>no_2.head(5)
6. Sub-sets and Filtering
import pandas as pd
#read csv file to pandas DatafFrame
titanic = pd.read_csv(“titanic.csv”)
air_quality = pd.read_csv(“air_quality_no2.csv”, index_col=0, parse_dates=True)
#save as excel, index=False removes row index numbers from export
titanic.to_excel(“titanic.xlsx”, sheet_name=”passengers”, index=False)
#attributes and methods of DataFrame
titanic.shape #row and columns count
titanic.info #
titanic.columns #
titanic.dtypes #data types
titanic.head(5) #head of df
titanic.tail(5) #tail of df
titanic.describe() #statistics of file
#Create Sub-set of DataFrame as series
ages = titanic[“Age”] #create sub-set from dataframe: note single bracket creates a pandas series.
type(titanic[“Age”]) #confirm type of Age object
#Create Sub-set of DataFrame as DataFrame
age_sex = titanic[[“Age”, “Sex”]] #inner brackets are list, outer is DataFrame
#Filtering rows getpassenger list with Age > 35
above_35 = titanic[titanic[“Age”] > 35] #return rows with age over 35
#Filter passenger list with Pclass = 2 or 3 (2 ways of doing this)
titanic[“Pclass”].isin([2, 3]) #checks if 2 or 3 is in Pclass and returns Boolean
titanic[“Pclass”].isin([2, 3])
#Filter passenger list with Age not NA
age_no_na = titanic[titanic[“Age”].notna()]
#Filter passenger liset and return list of Names
adult_names = titanic.loc[titanic[“Age”] > 35, “Name”]
#Filter specified rows and columns
titanic.iloc[9:25, 2:5]
#change the first 3 elements in row 3 to “anonymous”
titanic.iloc[0:3, 3] = “anonymous”
7. Combining Data from tables
import pandas as pd
import os
#check current working director
cwd = os.getcwd()
cwd
#Import data into DataFrames
titanic = pd.read_csv(“titanic.csv”)
air_quality_no2 = pd.read_csv(“air_quality_no2_long.csv”, parse_dates=True)
air_quality_pm25 = pd.read_csv(“air_quality_long.csv”, parse_dates=True)
stations_coord = pd.read_csv(“air_quality_stations.csv”)
air_quality_parameters = pd.read_csv(“air_quality_parameters.csv”)
#Create Sub-sets
air_quality_no2 = air_quality_no2[[“date.utc”, “location”,
“parameter”, “value”]]
air_quality_pm25 = air_quality_pm25[[“date.utc”, “location”,
“parameter”, “value”]]
#Concatonate Sub-sets – creates a union between the 2 tables.
#the Keys argument adds another hierachical index to show which table the data comes from
air_quality = pd.concat([air_quality_pm25, air_quality_no2], axis=0, keys=[“PM25”, “NO2”])
air_quality_no2.head(5)
air_quality_pm25.head(5)
air_quality_no2.shape #2068 rows 4 columns
air_quality_pm25.shape #5274 rows, 4 columns
air_quality.shape #7340 rows, 4 columms
air_quality = air_quality.sort_values(“date.utc”)
air_quality.head(25)
air_quality.to_csv(“checkcombine.csv”)
#note you can reset the index to a column
air_quality.reset_index(level=0)
stations_coord = pd.read_csv(“air_quality_stations.csv”)
stations_coord.head()
#merge the columns from the stations_coord dataframe to the air_quality df
#use location as the key to join the two dataframes
#how=”left” is left join from air_quality on the left
air_quality = pd.merge(air_quality, stations_coord, how=”left”, on=”location”)
air_quality.head(10)
air_quality.to_csv(“checkqualitymerge.csv”)
air_quality_parameters.head(5)
#merge left join using parameter column on left column and id on right
air_quality = pd.merge(air_quality, air_quality_parameters,
how=’left’, left_on=’parameter’, right_on=’id’)
8. Timeseries
import pandas as pd
import matplotlib.pyplot as plt
air_quality = pd.read_csv(“air_quality_no2_long.csv”)
air_quality = air_quality.rename(columns={“date.utc”: “datetime”})
air_quality.head(5)
air_quality.columns
#unique cities in air_quality df
air_quality.city.unique()
#read air_qualitydataframe to a datetime object instead of text
air_quality[“datetime”] = pd.to_datetime(air_quality[“datetime”])
air_quality[“datetime”]
#can also do this to read the dates into datetime format
#pd.read_csv(“../data/air_quality_no2_long.csv”, parse_dates=[“datetime”])
#check start and end datetime
air_quality[“datetime”].min(), air_quality[“datetime”].max()
#datetime difference using pandas timestamp
air_quality[“datetime”].max() – air_quality[“datetime”].min()
#Add a month column using the
air_quality[“month”] = air_quality[“datetime”].dt.month
#Average no2 concentration by day of week by location
air_quality.groupby([air_quality[“datetime”].dt.weekday, “location”])[“value”].mean()
#Plot bar graph using hour on the x axis and average no2 on the y
fig, axs = plt.subplots(figsize=(12, 4))
air_quality.groupby(air_quality[“datetime”].dt.hour)[“value”].mean().plot(
kind=’bar’, rot=0, ax=axs)
plt.xlabel(“Hour of the day”); # custom x label using Matplotlib
plt.ylabel(“$NO_2 (µg/m^3)$”);
#create pivot table
no_2 = air_quality.pivot(index=”datetime”, columns=”location”, values=”value”)
no_2.head(5)
#use index.year and index.weekday of the datetime object
no_2.index.year, no_2.index.weekday
#plot using a time range
no_2[“2019-05-20″:”2019-05-21”].plot();
#resample method of datetime object works like groupby. It takes an
#aggregation function and the “M” is time based grouping for Month (month end)
monthly_max = no_2.resample(“M”).max()
monthly_max
monthly_max.index.freq #tells you the frequence attribute – e.g. MonthEnd
#Create table plot of average no2 levels for each station
no_2.resample(“D”).mean().plot(style=”-o”, figsize=(10, 5));
9. Textual Data
import pandas as pd
titanic = pd.read_csv(“titanic.csv”)
titanic[“Name”].str.lower()
#Split the Name field by comma separated
titanic[“Name”].str.split(“,”)
#create series surname using the first element of the split name field
titanic[“Surname”] = titanic[“Name”].str.split(“,”).str.get(0)
titanic[“Surname”]
#Return name series with true false if contains Countess
titanic[“Name”].str.contains(“Countess”)
#return rows where name contains Countess
titanic[titanic[“Name”].str.contains(“Countess”)]
#string length
titanic[“Name”].str.len()
#find max name length
titanic[“Name”].str.len().idxmax()
#Return longest name
titanic.loc[titanic[“Name”].str.len().idxmax(), “Name”]
titanic[“Sex_short”] = titanic[“Sex”].replace({“male”: “M”, “female”: “F”})
titanic[“Sex_short”]
Leave a Reply