Pandas Quick Guide
Modules
import pandas as pd #pandas module
import scipy.stats as stats #stats sub-module
from bs4 import BeautifulSoup #scrape module
import requests #API module
import whois as whois #whois module
Files
df = pd.read_csv(“CookieData.csv”) #import CSV file to dataframe df
#define index column and convert dates to date format
air_quality = pd.read_csv(“air_quality_long.csv”, index_col=”date.utc”, parse_dates=True)
air_quality.to_csv(“checkcombine.csv”) #save dataframe to csv file
tests.to_excel(“tests.xlsx”, sheet_name=”expt”, index=False)
Dataframe info
df.describe() #output descriptive statistics of dataframe (numeric columns)
df.info() #display dataframe columns datatypes No-Null
df.dtypes #output datatypes of dataframe
df.shape #outputs rows and columns
df.sample(5) #output sample of 5 rows
df.columns #ouput columns
titanic.head(5) #print head of dataframe (top five rows)
titanic.tail(5) #print tail of dataframe (bottom five rows)
type(df) #output object type
Convert
gg[‘YearString’] = gg[‘Year’].astype(“string”) #convert series to string
gg[“YearInterval”] = gg[“YearString”].str[:3] #LEFT type function, get first 3 elements of string
gg[“YearInterval”] = gg[“YearInterval”].astype(int) #convert series integer
Delete Columns
del df2[“Outlier_Flag”]
Sub-set
df2 = df[[“version”, “retention_1”]]
Combine Sub-sets
#Concatonate Sub-sets – creates a union between the 2 tables. The Keys argument adds another hierachical index to show which table the data comes from
air_quality = pd.concat([air_quality_pm25, air_quality_no2], axis=0, keys=[“PM25”, “NO2”]) # creates union
merge the columns from the stations_coord dataframe to the air_quality df. Use location as the key to join the two dataframes, how=”left” is left join from air_quality on the left
air_quality = pd.merge(air_quality, stations_coord, how=”left”, on=”location”)
merge left join using parameter column on left column and id on right
air_quality = pd.merge(air_quality, air_quality_parameters,
how=’left’, left_on=’parameter’, right_on=’id’)
Dataframe Aggregate Functions
titanic[“Age”].mean()
titanic[[“Age”, “Fare”]].median()
Sort Dataframe
air_quality = air_quality.sort_values(“date.utc”)
sort1 = titanic.sort_values(by=”Age”).head()
sort2 = titanic.sort_values(by=[‘Pclass’, ‘Age’], ascending=False)~
no2_subset = no2.sort_index().groupby([“location”]).head(2) #group and sort by index
Filtering Dataframe
df2 = df[df.Outlier_Flag == 0]
gate_30 = df[df.version == “gate_30”]
no2 = air_quality[air_quality[“parameter”] == “no2”]
above_35 = titanic[titanic[“Age”] > 35]
class_23 = titanic[(titanic[“Pclass”] == 2) | (titanic[“Pclass”] ==3)]
age_no_na = titanic[titanic[“Age”].notna()] #return column not NA
#Return Names of passengers over 35
adult_names = titanic.loc[titanic[“Age”] > 35, “Name”]
titanic.iloc[9:25, 2:5] #Return specified columns and rows
titanic.iloc[0:3, 3] = “anonymous” #Change the 3rd element of the row to anonymous
def city_select(country): #function with country as parameter
df2 = df[df[“country”] == country] #slice dataframe where country = country e.g. United States
df2.sort_values(by=[“population”], ascending=False) #sort dataframe by population descending
df3 = df2.iloc[0:5] #new dataframe sorted by top 5 population
df4 = df3[“city”] #select city series (top 5 cities by population)
return df4
def latlong(country, city): #define function
selection = df[(df[“country”] == country) & (df[“city”] == city)] #filter dataframe by 2 criteria
return selection #return filtered dataframe
def get_lat(selection): #define function
latitude = selection[“lat”] #latitude = series with “lat” as column
lat = latitude.iloc[0] #get first row of latitude series
return lat #return first row
Groupby
df.groupby(“version”).count() #Version with Count version
df.groupby(“version”).sum() #Version with Sum columns
air_quality.groupby([air_quality[“datetime”].dt.weekday, “location”])[“value”].mean() #avg by day and location
air_quality.groupby(air_quality[“datetime”].dt.hour)[“value”].mean().plot(kind=’bar’, rot=0, ax=axs)
gg3 = gg.groupby([‘YearInterval’, ‘Month’], as_index=False)#Version with Sum columns
Create Pivot Table
table = pd.pivot_table(gate_30, values=’sum_gamerounds’, index=[‘version’], columns=[‘retention_1’],
aggfunc=”mean”, margins=”true”) #note margins give totals
no_2 = air_quality.pivot(index=”datetime”, columns=”location”, values=”value”)
air_quality.pivot_table(values=”value”, index=”location”, columns=”parameter”, aggfunc=”mean”) #wide format
DateTime
df[“datetime”] = pd.to_datetime(air_quality[“datetime”]) #change dates in dataframe to datetime formate
pd.read_csv(“../data/df_long.csv”, parse_dates=[“datetime”]) #does the same
df[“datetime”].min(), df[“datetime”].max() #start and end datetime
df[“datetime”].max() – df[“datetime”].min() #difference in Datetime between 2 dates
df[“month”] = df[“datetime”].dt.month #add column month based on other column
df.index.year, df.index.weekday #return year and weekday of datetime object.
df_pivoted = df.pivot(columns=”location”, values=”value”).reset_index() #reset pivot table index
df = df_pivoted.melt(id_vars=”date.utc”)
gg[“YearMonth”] = pd.to_datetime(dict(year=gg.Year, month=gg.Month, day=1)) #create date from columns year, month and 1 for first
gg3 = gg.groupby([‘YearInterval’, ‘Month’], as_index=False)#Version with Sum columns
gg4 = gg3.max() #change group by to max group by
Plot
df[“2019-05-20″:”2019-05-21”].plot(); #plot timerange
Working with Columns
air_quality.city.unique() #return unique cities in column cities
air_quality = air_quality.rename(columns={“date.utc”: “datetime”}) #rename columns
Split Columns in Dataframe
titanic[“Name”].str.split(“,”) #split Name column by comma
titanic[“Surname”] = titanic[“Name”].str.split(“,”).str.get(0) #split name column by comma and return the first value in the array.
df[“Name”].str.lower() #change column name to lowercase
df[“Name”].str.contains(“Countess”) #return true if column contains “Countess
df[“Name”].str.len() #return rows with string lenth of the Name column
df[“Name”].str.len().idxmax() #return length of longest name
df.loc[titanic[“Name”].str.len().idxmax(), “Name”] #return longest name in column using loc
#rename gender column and replace genders to M and F in dataframe
df[“gender_short”] = df[“gender”].replace({“male”: “M”, “female”: “F”}) #replace names in gender column with M and F
titanic[“Pclass”].isin([2, 3]) #checkss if 2 and 3 is in the PClass columns
adult_names = titanic.loc[titanic[“Age”] > 35, “Name”] #return Names of passengers over 35
print(“\n1 Day Retention by Version”) #\n for new line
print(f”You searched for: {domain_name}\n”) #f allows you to include placeholders in print
Loops
for link in a:
print(link.get
Input
domain = input(“Enter domain to search for: “)
Leave a Reply