import pandas as pd
# creating a DataFrame
df = pd.DataFrame({
'name': ['Rachel', 'Jacob', 'Dom', 'Syral', 'Naboi'],
'city': ['Japan', 'NewYork', 'London', 'Itly', 'France'],
'status': ['active', 'inactive', 'active', 'active', 'inactive'],
'salary': [3000, 5000, 2000, 7000, 4000]
})
print("Original DataFrame:")
print(df)
# delete first column(name) from DataFrame - df
df.drop(columns=df.columns[0], axis=1, inplace=True)
print("DataFrame after deletion:")
print(df)
import pandas as pd
# create a DataFrame
df = pd.DataFrame({
'name': ['Rachel', 'Jacob', 'Dom', 'Syral', 'Naboi'],
'city': ['Japan', 'NewYork', 'London', 'Itly', 'France'],
'status': ['active', 'inactive', 'active', 'active', 'inactive']
})
print(df)
# get count of rows which has value - active for status column
total_active_users = df[df['status']=='active'].shape[0]
print(f'Total active users are: {total_active_users}')
import pandas as pd
from tabulate import tabulate
# create a dataframe
df = pd.DataFrame({
'subject': ['Physics', 'Chemistry', 'English', 'Arts', 'Math', 'Ecomomics'],
'score': [90, 89, 93, 76, 82, 87],
'department': ['science', 'science', 'literature', 'arts', 'science', 'commerce']
})
print(tabulate(df, headers='keys', tablefmt='fancy_grid'))
import pandas as pd
# create a dataframe
df = pd.DataFrame({
'subject': ['Physics', 'Chemistry', 'English', 'Arts', 'Math', 'Ecomomics'],
'score': [90, 89, 93, 76, 82, 87],
'department': ['science', 'science', 'literature', 'arts', 'science', 'commerce']
})
print('ORIGINAL DATAFRAME')
print(df)
# get all rows that contain a substring - sci
df = df[df['department'].str.contains('sci')]
print('RESULT DATAFRAME')
print(df)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
'name': ['Rick', 'Carol', 'Daryl', 'Negan'],
'place': ['Alexendria', 'Kingdom', 'Hilltop', 'Saviours'],
'episode': [50, 30, 35, 29]
})
# add suffix
df = df.add_suffix('_watch')
print(df)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
'subject': ['Math', 'Physics', 'Chemistry', 'English', 'Computer Science'],
'score': [90, 70, 94, 67, 69]
})
print("DataFrame: ")
print(df)
# use where() function
result = df.where(df['score'] > 80)
print("Result: ")
print(result)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
'product_name': ['p1', 'p2', 'p3', 'p4'],
'price': [100.0, 30.0, 200.0, 140.0],
'quantity': [14, 67, 90, 40]
});
# add new row using df.loc[] method
result = df.loc[:,['price','quantity']]
print(result)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
"A": ['a', 'b', 'c', 'd', 'e'],
"B": [10, 20, 30, 40, 50],
"C": [1, 2, 3, 4, 5]
});
# add a new row to df DataFrame
row_dict = {"A": 'f', "B": 60, "C": 6}
df = df.append(row_dict, ignore_index=True)
print(df)
report_old.rename(columns={
_col: _col+'_old' for _col in report_old.columns if _col not in ['item_id', ]
}, inplace=True)
report_old = report_old[report_old["vid_old"]==report_old["vid_old"]]
import pandas as pd
from tabulate import tabulate
# create a dataframe
df = pd.DataFrame({
'subject': ['Math', 'Physics', 'Chemistry', 'Data Structue', 'History'],
'score': [90, 80, 85, 40, 70],
'status': ['unknown', 'unknown', 'unknown', 'unknown', 'unknown']
})
print(df)
# change 'status' column value based on 'score' value
df.loc[(df['score'] > 50), 'status'] = 'Pass'
print(df)
import pandas as pd
df = pd.DataFrame({
"col1": ["a1", "a2", "a3", "a4", "a5"],
"col2": [1, 2, 3, 4, 5],
"col3": [10, 20, 30, 40, 50]
})
df = df.assign(col3=1)
print(df)
import pandas as pd
data = {
"col_1": [10, None, 13, None, None, 40],
"col_2": [5, 10, None, 15, 20, None],
"col_3": [20, 30, 50, None, None, None]
}
df = pd.DataFrame(data)
result = df.isna().sum().sum()
print(result)
import pandas as pd
# create a datatframe
df = pd.DataFrame({
'fruit': ['Banana', 'Apple', 'Mango', 'Orange', 'Papaya'],
'price': [50, 60, 90, 70, 30]
})
print(df)
# get all values from a column as a list
result = df['price'].tolist()
print(result)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
"values": [1, 2, 1, 4, 2, 5],
"alphabets": ["A", "B", "A", "C", "B", "D"],
"numbers": [10, 20, 10, 40, 20, 50]
})
print(df)
# delete duplicate rows
df = df.drop_duplicates()
print(df)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
"id": [1, 2, 3, 4, 5],
"name": ["Joy", "Rick", "Carol", "Dumpty", "Clark"],
"designation": ["Programmer", "Manager", "Admin", "Accountant", "Designer"]
})
print(df)
# get all the indexes
indexes = df.index.values.tolist()
print("All indexes of DataFrame df are: ", end="")
print(indexes)
import pandas as pd
# create a dataframe
df = pd.DataFrame({
"id": [1, 2, 3, 4, 5],
"name": ["Delhi", "New York", "Mumbai", "LA", "London"],
"pincode": [100, 200, 300, 400, 500]
})
print(df)
# detete rows using index list
df.drop(df.index[[0,3,4]], inplace=True)
# print the dataframe
print(df)
import pandas as pd
# create a dictionary
user_info = {
'name': ['Rick', 'John', 'Callumn', 'Manoj', 'Dustin'],
'address': ['New York', 'LA', 'London', 'Delhi', 'Holland'],
'designation': ['Programmer', 'Accountant', 'Admin', 'Tech Lead', 'Manager']
}
# create the dataframe from a dictionary
df = pd.DataFrame(user_info)
print(df)
import pandas as pd
# create a dictionary
students = {
"name": ["Tom", "John", "Rick", "Sneha", "Johnson"],
"score": [90, 80, 98, 99, 82],
"subjects": ["Math", "Physics", "Chemistry", "English", "Data Structure"]
}
# create the dataframe from dictionary
df = pd.DataFrame(students, index=['a', 'b', 'c', 'd', 'e'])
print("Original DataFrame: ")
print(df)
#change rows orders using index list
df = df.reindex(['d', 'a', 'e', 'b', 'c'])
print("After ordering using row index:")
print(df)
import pandas as pd
data = [
['a_row', 'a1', 'a2', 'a3'],
['b_row', 'b1', 'b2', 'b3'],
['c_row', 'c1', 'c2', 'c3']
]
# create dataframe
df = pd.DataFrame(data, columns=['col_row', 'col_1', 'col_2', 'col_3'], index=['a', 'b', 'c'])
print(df)
# change column orders using column names list
df = df[['col_3', 'col_2', 'col_row', 'col_1']]
print(df)
import pandas as pd
employees = [
['Robin', 30, 0, 'India'],
['Rick', 35, 0, 'US'],
['Tony Stark', 24, 0, 'US'],
['Roney', 24, 0, 'Canada'],
['Sumit', 24, 0, 'India'],
['Parek Bisth', 24, 0, 'India']
]
# create dataframe
df = pd.DataFrame(employees, columns=['Name', 'Age', 'PeerCount', 'Country'])
print(df)
# check if PeerCount column contains 0 values only
if (df['PeerCount'] == 0).all():
print('All values in PeerCount column are zero')
else:
print('All values in PeerCount column are not zero')
import pandas as pd
employees = [
['Robin', 30, 'SOS', 'India'],
['Rick', 28, 'PSA', 'US'],
['Tony Stark', 32, 'COS', 'US']
]
# create dataframe
df = pd.DataFrame(employees, columns=['Name', 'Age', 'Dept', 'Country'])
# check if column exist in the dataframe
if 'Age' in df.columns:
print("Coumn exists in the DataFrame")
else:
print("Column does not exist in the DataFrame")
import pandas as pd
# DataFrame 1
df1 = pd.DataFrame({'id': ['1000', '1001', '1002', '1003'],
'company': ['Devsheet', 'SpaceX', 'Tesla', 'Open AI']})
# DataFrame 2
df2 = pd.DataFrame({'location': ['India', 'US', 'US', 'US'],
'product': ['Website', 'Rocket', 'Car', 'ML']})
result = pd.concat([df1, df2], axis=1)
print(result)
import pandas as pd
data = {
'score_1': [30, 40, 25, 19, 60, 15],
'score_2': [3, 7, 9, 2, 5, 11]
}
# create dataframe
df = pd.DataFrame(data)
# sort DataFrame - ascending order
result = df.sort_values(by='score_1')
print(result)
import pandas as pd
employees = {
"name": ["Richard", "Patrick", "Sam", "Vick"],
"address": ["New York", "Atlanta", "Canada", "India"],
"salary": [10000, 20000, 15000, 22000]
}
# create dataframe
df = pd.DataFrame(employees)
print(df)
# add new column - deduction that has 10% of salary
df["deduction"] = df["salary"] * 10 / 100
print(df)
import pandas as pd
data = {
"subjects": ["Math", "Physics", "Chemistry", "English", "Hindi"],
"scores": [90, 80, 87, 80, 70]
}
df = pd.DataFrame(data)
# Get rows and columns count
result = df.shape
print("Rows: ", result[0])
print("Columns: ", result[1])
import pandas as pd
# create a List of Lists
data_list = [
["John", 30, "Math"],
["Tom", 50, "Physics"],
["Pettrick", 40, "Chemistry"],
["Travis", 35, "English"]
]
# create the dataframe from List
df = pd.DataFrame(data_list, columns=['name', 'score', 'subject'])
print(df)
import pandas as pd
data = {
'alphabets': ['a', 'b', 'c', 'd']
}
# Create dataframe
df = pd.DataFrame(data)
print(df)
# Add new column with default value 0
df['code'] = 0
print(df)
import pandas as pd
my_data = {
"names": ["Shivam", "John", "Prateek", "Gaurav"],
"profession": ["Engineer", "Carpenter", "Writer", "marketer"]
}
df = pd.DataFrame(my_data)
# Add new column with rows data
df['address'] = ['India', 'New York', 'India', 'India']
print(df)
import pandas as pd
subjects = {
"name": ["Math", "Physics", "Chemistry", "English", "Hindi"],
"score": [98, 91, 78, 80, 76],
"code": ["M01", "P02", "C04", "E02", "H08"]
}
# Create the dataframe
df = pd.DataFrame(subjects)
print(df)
# Delete a column
df = df.drop(columns=['name'])
print(df)
import pandas as pd
my_data = {
"names": ["Rick", "John", "Carol", "Gibson"],
"profession": ["Engineer", "Carpenter", "Writer", "marketer"]
}
df = pd.DataFrame(my_data)
print(df)
# This will get the list of all the rows in column names
result = df['names'].tolist()
print(result)
df.fillna('', inplace=True)
import pandas as pd
source_list = [1, 1, 3, 2, 3, 1, 2, 4]
result = pd.unique(source_list).tolist()
print(result)
import pandas as pd
data = {
'name': ['Rick', 'Carol', 'Carl', 'Negan'],
'place': ['Alexendria', 'Kingdom', 'Alexendria', 'Saviours']
}
df = pd.DataFrame.from_dict(data)
print(df)
# name place
# 0 Rick Alexendria
# 1 Carol Kingdom
# 2 Carl Alexendria
# 3 Negan Saviours
# Get dict from dataframe(Method 1)
dict1 = dict(df.values)
print(dict1)
# -> {'Rick': 'Alexendria', 'Carol': 'Kingdom', 'Carl': 'Alexendria', 'Negan': 'Saviours'}
# Get dict from dataframe(Method 2)
dict2 = df.to_dict('index')
print(dict2)
# -> {0: {'name': 'Rick', 'place': 'Alexendria'}, 1: {'name': 'Carol', 'place': 'Kingdom'}, 2: {'name': 'Carl', 'place': 'Alexendria'}, 3: {'name': 'Negan', 'place': 'Saviours'}}
#By creating a copy - using df.rename() method
df = df.rename(columns={'col_1': 'changed_col_1', 'col_2': 'changed_col_2'})
print(df)
#Without creating copy - using df.rename() method with inplace=True
df.rename(columns={'col_1': 'changed_col_1', 'col_2': 'changed_col_2'}, inplace=True)
print(df)
import pandas as pd
df = pd.DataFrame({
'Name': ['John', 'Eric', 'Rick'],
'Score': [200, 500, 100\]
})
df_columns = df.columns.values.tolist()
print(df_columns)
#prints - ['Name', 'Score']
import pandas as pd
df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
index=[1, 2, 3], columns=['A', 'B', 'C'])
#index - 3, column_name - 'C'
value = df.at[3, 'C']
print(value) # prints - 30
#this can also written as
value = df.loc[3].at['C']
print( value ) # prints - 30
import pandas as pd
df = pd.DataFrame({
'column_1': ['John', 'Eric', 'Rick'],
'column_2': [100, 110, 120]
})
for index, row in df.iterrows():
print(row['column_1'], row['column_2'])
#prints
# John 100
# Eric 110
# Rick 120
import pandas as pd
#first method - create empty dataframe
df = pd.DataFrame()
# append columns and rows to this dataframe
df['username'] = ['john', 'neil', 'curtis']
df['status'] = ['active', 'disabled', 'active']
print(df)
#----------------------------------------------------------------------------------#
#second method - create dataframe with columns
df = pd.DataFrame(columns = ['username', 'status'])
# append rows to the abo
df = df.append({'username' : 'john', 'status' : 'active'}, ignore_index = True)
df = df.append({'username' : 'neil', 'status' : 'disabled'}, ignore_index = True)
df = df.append({'username' : 'curtis', 'status' : 'active'}, ignore_index = True)
print(df)
#----------------------------------------------------------------------------------#
#make data frame with rows having - NaN values at index - a, b and c
df = pd.DataFrame(columns = ['username', 'status'], index = ['a', 'b', 'c'])
print(df)
# +---+----------+--------+
# | | username | status |
# +---+----------+--------+
# | a | NaN | NaN |
# | b | NaN | NaN |
# | c | NaN | NaN |
# +---+----------+--------+
# add rows at already created indexes
df.loc['a'] = ['john', 'active']
df.loc['b'] = ['neil', 'disabled']
df.loc['c'] = ['curtis', 'active']
print(df)
import pandas as pd
my_dict = [{'Name': 'John', 'Score': 20}, {'Name': 'Mark', 'Score': 50}, {'Name': 'Rick', 'Score': 70}]
df = pd.DataFrame.from_dict(my_dict)
print(df)
# +----+--------+---------+
# | | Name | Score |
# +-------------+---------+
# | 0 | John | 20 |
# | 1 | Mark | 50 |
# | 2 | Rick | 70 |
# +----+--------+---------+
import pandas as pd
# create dataframe and add column and rows to it
df_fruits = pd.DataFrame()
df_fruits["fruit_name"] = ["Orange", "Banana", "Apple", "Grapes"]
df_fruits["color"] = ["orange", "yellow", "red", "green"]
df_fruits["price"] = [50, 100, 150, 70]
# convert DataFrame to list of dictionaries
result = df_fruits.to_dict('records')
print(result)
import pandas as pd
df = pd.DataFrame()
df["Name"] = ["John", "Mark", "Rick"]
df["Score"] = [20, 50, 70]
print(df.to_dict())
# prints - {'Name': {0: 'John', 1: 'Mark', 2: 'Rick'}, 'Score': {0: 20, 1: 50, 2: 70}}
print(df.to_dict('split'))
# prints - {'index': [0, 1, 2], 'columns': ['Name', 'Score'], 'data': [['John', 20], ['Mark', 50], ['Rick', 70]]}
print(df.to_dict('records'))
# prints - [{'Name': 'John', 'Score': 20}, {'Name': 'Mark', 'Score': 50}, {'Name': 'Rick', 'Score': 70}]
print(df.to_dict('index'))
#prints - {0: {'Name': 'John', 'Score': 20}, 1: {'Name': 'Mark', 'Score': 50}, 2: {'Name': 'Rick', 'Score': 70}}
def outliers(column):
Q1, Q3 = np.percentile(column, [25, 75])
IQR = Q3 - Q1
lower_range = Q1 - (1.5 * IQR)
upper_range = Q3 + (1.5 * IQR)
return lower_range, upper_range
outliers(df['yearly_salary'])
dframe = dframe[['column 5','column 6',4,3,2,1]]