Datasource: https://www.kaggle.com/lava18/google-play-store-apps
Pandas is an open source Python package that makes working with data more intuitive and faster.
The primary Pandas objects are Series and DataFrames.
Series are 1D arrays where the elements have index labels.
DataFrames are 2D arrays with labeled columns and indexed rows, comparable to an SQL table but with more features.
This tutorial focuses on the Pandas DataFrame, so we first need to import pandas:
import pandas as pd
df = pd.DataFrame(columns=['A', 'B', 'C'])
print(df)
df
Empty DataFrame
Columns: [A, B, C]
Index: []
A | B | C |
---|---|---|
df = pd.DataFrame(columns=['A','B','C'], index=['W', 'X','Y','Z'])
df
A | B | C | |
---|---|---|---|
W | NaN | NaN | NaN |
X | NaN | NaN | NaN |
Y | NaN | NaN | NaN |
Z | NaN | NaN | NaN |
When providing a list of lists, each internal list is interpreted as a row in the DataFrame.
It is important to note that all of the lists must be the same length for this to work.
data=[
["A",1,3],
["B",6,8],
["C",3,4],
["D",9,12],]
df = pd.DataFrame(data)
df
0 | 1 | 2 | |
---|---|---|---|
0 | A | 1 | 3 |
1 | B | 6 | 8 |
2 | C | 3 | 4 |
3 | D | 9 | 12 |
If the lists we have represent a column rather than a row, we can use a dictionary of the lists where the key for each list becomes the column label.
This method also requires the lists to be the same length.
data={
"Name":["John","Jo","Sam","April"],
"Year":["Senior","Junior","Freshman","Junior"],
"GPA":[3.5,3.0,4.0,3.8]
}
df = pd.DataFrame(data)
df
Name | Year | GPA | |
---|---|---|---|
0 | John | Senior | 3.5 |
1 | Jo | Junior | 3.0 |
2 | Sam | Freshman | 4.0 |
3 | April | Junior | 3.8 |
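The index parameter from earlier works with the dictionary constructor as well. A minimal sketch, using hypothetical row labels:

df = pd.DataFrame(data, index=["s1", "s2", "s3", "s4"])
df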
For less structured data, a list of dictionaries may be more appropriate.
Each dictionary represents a row and each key in the dictionary represents a column.
The dictionaries may have any number of keys and they are not required to match.
data=[
{"Name":"John","GPA":3.5},
{},
{"Name":"Jo","Year":"Junior","Job":"Waiter","GPA":3.0},
{"Name":"Sam","Job":"Data Analyst","GPA":4.0},
{"Name":"April","Year":"Junior","GPA":3.8}]
df = pd.DataFrame(data)
df
Name | GPA | Year | Job | |
---|---|---|---|---|
0 | John | 3.5 | NaN | NaN |
1 | NaN | NaN | NaN | NaN |
2 | Jo | 3.0 | Junior | Waiter |
3 | Sam | 4.0 | NaN | Data Analyst |
4 | April | 3.8 | Junior | NaN |
More often than not, though, data comes from external sources. Pandas provides a few different input methods to create DataFrames from specific types of sources.
Flat files include text or CSV files that use a delimiter to separate the columns. Commonly used methods are read_table, which assumes a tab delimiter, and read_csv, which assumes a comma. There is a plethora of optional parameters to explore when working with different datasets, documented in the pandas IO guide. Note that the .head(n=5) method can be called on a DataFrame to show only the first n rows.
apps = pd.read_table("data/googleplaystore.csv",delimiter=',')
apps.head()
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
app_reviews = pd.read_csv("data/googleplaystore_user_reviews.csv")
app_reviews.head()
App | Translated_Review | Sentiment | Sentiment_Polarity | Sentiment_Subjectivity | |
---|---|---|---|---|---|
0 | 10 Best Foods for You | I like eat delicious food. That's I'm cooking ... | Positive | 1.00 | 0.533333 |
1 | 10 Best Foods for You | This help eating healthy exercise regular basis | Positive | 0.25 | 0.288462 |
2 | 10 Best Foods for You | NaN | NaN | NaN | NaN |
3 | 10 Best Foods for You | Works great especially going grocery store | Positive | 0.40 | 0.875000 |
4 | 10 Best Foods for You | Best idea us | Positive | 1.00 | 0.300000 |
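Before moving on, it is worth seeing a few of the more common read_csv options in action. A hedged sketch with hypothetical parameter choices for this dataset:

apps_sample = pd.read_csv(
    "data/googleplaystore.csv",
    usecols=["App", "Category", "Rating"],   # load only these columns
    nrows=100,                               # read only the first 100 rows
    na_values=["Varies with device"])        # treat this string as a null value
apps_sample.head()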
Another practical data source is an HTML file. The read_html method extracts the tables found within an HTML source, such as a URL, and returns them as a list of DataFrames.
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
df[0].head()
Country/Area | UN continentalregion[4] | UN statisticalsubregion[4] | Population(1 July 2018) | Population(1 July 2019) | Change | |
---|---|---|---|---|---|---|
0 | China[a] | Asia | Eastern Asia | 1427647786 | 1433783686 | +0.43% |
1 | India | Asia | Southern Asia | 1352642280 | 1366417754 | +1.02% |
2 | United States | Americas | Northern America | 327096265 | 329064917 | +0.60% |
3 | Indonesia | Asia | South-eastern Asia | 267670543 | 270625568 | +1.10% |
4 | Pakistan | Asia | Southern Asia | 212228286 | 216565318 | +2.04% |
A subset of columns can be selected by indexing the DataFrame with a list of column labels; here we combine it with head to keep the output short.
app_reviews.head()[["App"]]
App | |
---|---|
0 | 10 Best Foods for You |
1 | 10 Best Foods for You |
2 | 10 Best Foods for You |
3 | 10 Best Foods for You |
4 | 10 Best Foods for You |
app_reviews.head()[["Sentiment","Sentiment_Polarity","Sentiment_Subjectivity"]]
Sentiment | Sentiment_Polarity | Sentiment_Subjectivity | |
---|---|---|---|
0 | Positive | 1.00 | 0.533333 |
1 | Positive | 0.25 | 0.288462 |
2 | NaN | NaN | NaN |
3 | Positive | 0.40 | 0.875000 |
4 | Positive | 1.00 | 0.300000 |
With boolean indexing, denoted by square brackets, a subset of rows is returned based on a series of True/False values of the same length as the DataFrame. This is very useful for extracting rows that meet certain criteria.
bools = app_reviews["Sentiment"]=="Positive"
print(bools)
app_reviews[bools]
0         True
1         True
2        False
3         True
4         True
         ...
64290    False
64291    False
64292    False
64293    False
64294    False
Name: Sentiment, Length: 64295, dtype: bool
App | Translated_Review | Sentiment | Sentiment_Polarity | Sentiment_Subjectivity | |
---|---|---|---|---|---|
0 | 10 Best Foods for You | I like eat delicious food. That's I'm cooking ... | Positive | 1.000000 | 0.533333 |
1 | 10 Best Foods for You | This help eating healthy exercise regular basis | Positive | 0.250000 | 0.288462 |
3 | 10 Best Foods for You | Works great especially going grocery store | Positive | 0.400000 | 0.875000 |
4 | 10 Best Foods for You | Best idea us | Positive | 1.000000 | 0.300000 |
5 | 10 Best Foods for You | Best way | Positive | 1.000000 | 0.300000 |
... | ... | ... | ... | ... | ... |
64217 | Housing-Real Estate & Property | I able set range 1cr, scroll space 0-1cr range... | Positive | 0.233333 | 0.550000 |
64221 | Housing-Real Estate & Property | Everything old stuff neither clear sold proper... | Positive | 0.021591 | 0.259470 |
64222 | Housing-Real Estate & Property | Most ads older many agents ..not much owner po... | Positive | 0.173333 | 0.486667 |
64223 | Housing-Real Estate & Property | If photos posted portal load, fit purpose. I'm... | Positive | 0.225000 | 0.447222 |
64227 | Housing-Real Estate & Property | I property business got link SMS happy perform... | Positive | 0.800000 | 1.000000 |
23998 rows × 5 columns
We can see that 23,998 of the reviews were marked as positive.
This can also be done in a single line, with no extra variable:
app_reviews[app_reviews["Sentiment"]=="Positive"]
App | Translated_Review | Sentiment | Sentiment_Polarity | Sentiment_Subjectivity | |
---|---|---|---|---|---|
0 | 10 Best Foods for You | I like eat delicious food. That's I'm cooking ... | Positive | 1.000000 | 0.533333 |
1 | 10 Best Foods for You | This help eating healthy exercise regular basis | Positive | 0.250000 | 0.288462 |
3 | 10 Best Foods for You | Works great especially going grocery store | Positive | 0.400000 | 0.875000 |
4 | 10 Best Foods for You | Best idea us | Positive | 1.000000 | 0.300000 |
5 | 10 Best Foods for You | Best way | Positive | 1.000000 | 0.300000 |
... | ... | ... | ... | ... | ... |
64217 | Housing-Real Estate & Property | I able set range 1cr, scroll space 0-1cr range... | Positive | 0.233333 | 0.550000 |
64221 | Housing-Real Estate & Property | Everything old stuff neither clear sold proper... | Positive | 0.021591 | 0.259470 |
64222 | Housing-Real Estate & Property | Most ads older many agents ..not much owner po... | Positive | 0.173333 | 0.486667 |
64223 | Housing-Real Estate & Property | If photos posted portal load, fit purpose. I'm... | Positive | 0.225000 | 0.447222 |
64227 | Housing-Real Estate & Property | I property business got link SMS happy perform... | Positive | 0.800000 | 1.000000 |
23998 rows × 5 columns
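Multiple conditions can also be combined with the element-wise operators & (and), | (or), and ~ (not), with each condition wrapped in parentheses. A minimal sketch:

# Positive reviews with a strongly positive polarity score
strong_positive = app_reviews[(app_reviews["Sentiment"]=="Positive") & (app_reviews["Sentiment_Polarity"]>0.5)]
strong_positive.head()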
Boolean indexing can also be used to select a single row by its index; note that a DataFrame, rather than a Series, is returned.
apps[apps.index==1057]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1057 | Rabo Banking | FINANCE | 3.4 | 31906 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Finance | July 19, 2018 | 5.16.0 | 4.0 and up |
The loc accessor can be used to access a single row or a specified set of rows and columns.
apps.loc[[1057]]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1057 | Rabo Banking | FINANCE | 3.4 | 31906 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Finance | July 19, 2018 | 5.16.0 | 4.0 and up |
app_reviews.loc[:,["App"]]
App | |
---|---|
0 | 10 Best Foods for You |
1 | 10 Best Foods for You |
2 | 10 Best Foods for You |
3 | 10 Best Foods for You |
4 | 10 Best Foods for You |
... | ... |
64290 | Houzz Interior Design Ideas |
64291 | Houzz Interior Design Ideas |
64292 | Houzz Interior Design Ideas |
64293 | Houzz Interior Design Ideas |
64294 | Houzz Interior Design Ideas |
64295 rows × 1 columns
Unlike regular Python slices, slices with loc include both endpoints, so 1057:1060 returns four rows.
apps.loc[1057:1060]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1057 | Rabo Banking | FINANCE | 3.4 | 31906 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Finance | July 19, 2018 | 5.16.0 | 4.0 and up |
1058 | Capitec Remote Banking | FINANCE | 4.3 | 20672 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Finance | May 8, 2018 | Varies with device | Varies with device |
1059 | Itau bank | FINANCE | 4.2 | 957973 | 40M | 10,000,000+ | Free | 0 | Everyone | Finance | July 30, 2018 | 6.5.7 | 4.2 and up |
1060 | Nubank | FINANCE | 4.7 | 130582 | 24M | 5,000,000+ | Free | 0 | Everyone | Finance | August 2, 2018 | Varies with device | Varies with device |
Another convenient feature of the loc accessor is that it can add rows by creating index labels that do not already exist.
df = pd.DataFrame(columns=['Name','Year','GPA'])
df.loc[0]=["John","Senior",3.5]
df.loc[1]=["Jo","Junior",3.0]
df.loc['a']=["Sam","Freshman",4.0]
df.loc['b']=["April","Junior",3.8]
df.loc['x','GPA']=4.0
df.loc['x','Year']="Freshman"
df
Name | Year | GPA | |
---|---|---|---|
0 | John | Senior | 3.5 |
1 | Jo | Junior | 3.0 |
a | Sam | Freshman | 4.0 |
b | April | Junior | 3.8 |
x | NaN | Freshman | 4.0 |
If we do not want to use the index labels, we can instead access rows and columns by integer position using the iloc accessor.
df.iloc[4,0] = "Alex"
df
Name | Year | GPA | |
---|---|---|---|
0 | John | Senior | 3.5 |
1 | Jo | Junior | 3.0 |
a | Sam | Freshman | 4.0 |
b | April | Junior | 3.8 |
x | Alex | Freshman | 4.0 |
However, iloc cannot be used to add new rows to the DataFrame:
df.iloc[5]=["James","Senior",1.0]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-20-d3d23e3ccd8d> in <module>
----> 1 df.iloc[5]=["James","Senior",1.0]

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
    687         key = com.apply_if_callable(key, self.obj)
    688         indexer = self._get_setitem_indexer(key)
--> 689         self._has_valid_setitem_indexer(key)
    690
    691         iloc = self if self.name == "iloc" else self.obj.iloc

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_setitem_indexer(self, indexer)
   1399             elif is_integer(i):
   1400                 if i >= len(ax):
-> 1401                     raise IndexError("iloc cannot enlarge its target object")
   1402             elif isinstance(i, dict):
   1403                 raise IndexError("iloc cannot enlarge its target object")

IndexError: iloc cannot enlarge its target object
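If a new row genuinely needs to be added, the loc accessor shown earlier can enlarge the DataFrame:

# loc, unlike iloc, creates the index label if it does not exist
df.loc[5] = ["James", "Senior", 1.0]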
With a basic understanding of DataFrames, we can start to explore them more in depth. The first exploratory task when working with new data is to get an overall picture of it. The .dtypes property shows the datatype of each column of a DataFrame, and the .shape property returns its size as a tuple of (# of rows, # of columns).
print("App DataFrame Summary:")
print(apps.dtypes)
print(apps.shape)
App DataFrame Summary:
App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object
(10841, 13)
print("App Reviews DataFrame Summary:")
print(app_reviews.dtypes)
print(app_reviews.shape)
App Reviews DataFrame Summary:
App                        object
Translated_Review          object
Sentiment                  object
Sentiment_Polarity        float64
Sentiment_Subjectivity    float64
dtype: object
(64295, 5)
We can see that there are 10,841 different apps and a total of 64,295 reviews. The apps DataFrame has 13 columns, the app_reviews DataFrame has 5, and every column has either an object or float64 datatype.
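As an aside, the info method reports the column datatypes, non-null counts, and approximate memory usage in a single call:

apps.info()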
However, some columns of the app data, such as Reviews and Price, were assigned the object datatype even though they appear to be numerical. Let's figure out why:
def floatTest(df, column):
    '''
    input:
        df: DataFrame to test
        column: string of column name to be tested
    '''
    # Test to make sure column is actually a column in df
    if column in df:
        try:
            # Loop over the elements of the column and try to cast each to a float
            for i in df[column]:
                x = i
                float(i)
            print("Casting " + column + " to float is possible")
        except ValueError:
            # If a conversion fails, print the value that caused the error
            print("First non-float value in the " + column + " column: " + x)
    else:
        print(column + " is not a column in the DataFrame provided")
floatTest(apps,"Reviews")
floatTest(apps,"Price")
First non-float value in the Reviews column: 3.0M
First non-float value in the Price column: $4.99
When we try to cast the Reviews column to a float, we can see that 3.0M is a value in the column. For the Price column, the first non-float value we find is $4.99. This is important to know, and we will resolve these values before casting the columns.
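As an alternative to the custom floatTest helper, the built-in pd.to_numeric with errors='coerce' converts unparseable entries to NaN, which makes them easy to locate. A minimal sketch:

# Rows whose Reviews value cannot be parsed as a number
coerced = pd.to_numeric(apps["Reviews"], errors="coerce")
apps.loc[coerced.isna(), ["App", "Reviews"]]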
Before we start the cleaning stage, we will look for columns with a low number of unique values so we can cast them to the category datatype. The category datatype is not always the right choice, but if used correctly it can reduce memory usage, increase performance, and allow for custom sorting.
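As a quick illustration of custom sorting, an ordered categorical sorts by its declared category order rather than alphabetically. A minimal sketch with hypothetical data:

# Declare an explicit ordering for the categories
year_type = pd.CategoricalDtype(categories=["Freshman", "Junior", "Senior"], ordered=True)
years = pd.Series(["Senior", "Junior", "Freshman"], dtype=year_type)
print(years.sort_values())  # sorts Freshman, Junior, Senior rather than alphabetically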
def uniqueTable(df):
    '''
    input:
        df: DataFrame
    output:
        DataFrame with datatype and number of unique values per column
    '''
    # Create empty DataFrame with 2 columns: "Data Type" and "Unique Values"
    uniques = pd.DataFrame(columns=["Data Type", "Unique Values"])
    for i in df.columns:
        # Add the datatype and number of unique values as a row indexed by the column name
        uniques.loc[i] = [df[i].dtype, df[i].nunique()]
    return uniques
print("Number of unique values in each column of apps DataFrame:")
uniqueTable(apps)
Number of unique values in each column of apps DataFrame:
Data Type | Unique Values | |
---|---|---|
App | object | 9660 |
Category | object | 34 |
Rating | float64 | 40 |
Reviews | object | 6002 |
Size | object | 462 |
Installs | object | 22 |
Type | object | 3 |
Price | object | 93 |
Content Rating | object | 6 |
Genres | object | 120 |
Last Updated | object | 1378 |
Current Ver | object | 2832 |
Android Ver | object | 33 |
From the output, we can see that Category, Installs, Type, Content Rating, and Android Ver all have the object datatype and fewer than 50 unique values. These columns could be stored more efficiently as the category datatype.
print("Number of unique values in each column of app_reviews DataFrame:")
uniqueTable(app_reviews)
Number of unique values in each column of app_reviews DataFrame:
Data Type | Unique Values | |
---|---|---|
App | object | 1074 |
Translated_Review | object | 27994 |
Sentiment | object | 3 |
Sentiment_Polarity | float64 | 5410 |
Sentiment_Subjectivity | float64 | 4474 |
The Sentiment column has only 3 unique values, so it would definitely benefit from being converted to the category datatype.
Let's start the cleaning with the Price column.
floatTest(apps,"Price")
First non-float value in the Price column: $4.99
It seems that the Price values have a $ character at the beginning in some cases. We can use the replace method combined with a regular expression (regex) to remove the $ character.
apps["Price"] = apps["Price"].replace(r'\$', '', regex=True)
Now that we have solved that issue, let's rerun floatTest to see if there are any other non-float values in the Price column.
floatTest(apps,"Price")
First non-float value in the Price column: Everyone
It looks like one of the values in the Price column is Everyone, which definitely does not seem right. We can use boolean indexing to find the culprit row.
apps[apps["Price"]=="Everyone"]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10472 | Life Made WI-Fi Touchscreen Photo Frame | 1.9 | 19.0 | 3.0M | 1,000+ | Free | 0 | Everyone | NaN | February 11, 2018 | 1.0.19 | 4.0 and up | NaN |
We can see that the values are shifted over by one column probably due to this app not having a Category.
It also happens that for this row the Reviews column contains the value 3.0M, which is the same issue that floatTest found. Let's quickly fix it by shifting the values over one column and replacing the missing Category with a null value.
apps.loc[10472,"Rating":"Android Ver"] = apps.loc[10472,"Category":"Current Ver"].to_numpy()
apps.loc[10472,"Category"] = None
apps.loc[[10472]]
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10472 | Life Made WI-Fi Touchscreen Photo Frame | None | 1.9 | 19.0 | 3.0M | 1,000+ | Free | 0 | Everyone | NaN | February 11, 2018 | 1.0.19 | 4.0 and up |
Let's rerun floatTest for both columns...
floatTest(apps,"Price")
floatTest(apps,"Reviews")
Casting Price to float is possible
Casting Reviews to float is possible
Sweet! It looks like our fixes worked!
Finally, we can cast the columns using the astype method. Calling the memory_usage method before and after the cast shows how much memory we save.
print("Memory usage of Apps DataFrame before cast: " + str(apps.memory_usage().sum()))
Memory usage of Apps DataFrame before cast: 1127592
casts = {
"Category":"category",
"Rating":"float64",
"Reviews":"float64",
"Installs":"category",
"Type":"category",
"Price": "float64",
"Content Rating":"category",
"Genres":"category",
"Android Ver":"category"}
apps = apps.astype(casts)
apps.dtypes
App                 object
Category          category
Rating             float64
Reviews            float64
Size                object
Installs          category
Type              category
Price              float64
Content Rating    category
Genres            category
Last Updated        object
Current Ver         object
Android Ver       category
dtype: object
print("Memory usage of Apps Data after cast: " + str(apps.memory_usage().sum()))
Memory usage of Apps Data after cast: 681130
print("Memory usage of Sentiment Column before cast: " + str(app_reviews["Sentiment"].memory_usage()) + '\n')
app_reviews = app_reviews.astype({"Sentiment":'category'})
print("Memory usage of Sentiment Column after cast: " + str(app_reviews["Sentiment"].memory_usage())+'\n')
app_reviews.dtypes
Memory usage of Sentiment Column before cast: 514488

Memory usage of Sentiment Column after cast: 64555
App                         object
Translated_Review           object
Sentiment                 category
Sentiment_Polarity         float64
Sentiment_Subjectivity     float64
dtype: object
Wow! The Apps DataFrame uses about half the memory after casting, and the Sentiment column uses about one eighth!
The next step in the cleaning process is to remove null values. First, we use the isna method to count the null values in each column to see how we should address them.
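As an aside, the same per-column counts are available in a single expression:

apps.isna().sum()

For output formatted consistently with the rest of this tutorial, we will use a small helper instead.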
def nulls(df):
    '''
    input:
        df: DataFrame
    '''
    # Loop over columns
    for col in df.columns:
        # Print the column name and the number of null values in it
        n = df[df[col].isna()]
        print(str(col) + " : " + str(len(n)))
nulls(apps)
App : 0
Category : 1
Rating : 1474
Reviews : 0
Size : 0
Installs : 0
Type : 1
Price : 0
Content Rating : 0
Genres : 1
Last Updated : 0
Current Ver : 8
Android Ver : 2
Some of the columns have only a few null values, so we will remove the rows containing them. However, the Rating column has numerous null values, so we will create a separate DataFrame with those rows removed.
apps.dropna(subset=["Category","Type","Genres","Current Ver","Android Ver"],inplace=True)
apps_with_rating=apps.dropna()
nulls(apps)
App : 0
Category : 0
Rating : 1469
Reviews : 0
Size : 0
Installs : 0
Type : 0
Price : 0
Content Rating : 0
Genres : 0
Last Updated : 0
Current Ver : 0
Android Ver : 0
nulls(apps_with_rating)
App : 0
Category : 0
Rating : 0
Reviews : 0
Size : 0
Installs : 0
Type : 0
Price : 0
Content Rating : 0
Genres : 0
Last Updated : 0
Current Ver : 0
Android Ver : 0
Now we have two DataFrames: apps, which keeps the rows with null Rating values, and apps_with_rating, which excludes them.
nulls(app_reviews)
App : 0
Translated_Review : 26868
Sentiment : 26863
Sentiment_Polarity : 26863
Sentiment_Subjectivity : 26863
Many of these rows seem to have all null values except for the app name, so we will remove all rows with null values.
app_reviews.dropna(inplace=True)
With the cleaning done, the apps DataFrame can be written to a compressed CSV file with the to_csv method.
apps.to_csv("data/cleaned/googleplaystoreclean.csv",mode='w',compression='zip')
Another storage format is an HDF file, where multiple DataFrames can be written to the same file under different keys.
apps_with_rating.to_hdf("data/cleaned/appdata.hd5",key='ratings',mode='w',format='table')
app_reviews.to_hdf("data/cleaned/appdata.hd5",key='reviews',format='table')
There are many other storage formats and optional parameters that can be explored in the pandas IO documentation.
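For example, the columnar Parquet format is a popular choice. A hedged sketch, assuming the optional pyarrow (or fastparquet) dependency is installed:

# Parquet stores the data column-wise with efficient compression
apps.to_parquet("data/cleaned/googleplaystoreclean.parquet")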
To demonstrate how DataFrames can be combined, let's first create three small DataFrames.
df1 = pd.DataFrame([
[0,1,2],
[1,2,3],
[2,3,4]])
df2 = pd.DataFrame([
[4,5,6],
[6,7,8],
[8,9,10]])
df3 = pd.DataFrame([
[11,12,13],
[13,14,15],
[15,16,17]])
The concat method stacks a list of DataFrames along an axis; with axis=1 the frames are placed side by side, and ignore_index=True renumbers the resulting columns.
pd.concat([df1,df2,df3],axis=1,ignore_index=True)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 2 | 4 | 5 | 6 | 11 | 12 | 13 |
1 | 1 | 2 | 3 | 6 | 7 | 8 | 13 | 14 | 15 |
2 | 2 | 3 | 4 | 8 | 9 | 10 | 15 | 16 | 17 |
With axis=0 (the default), the rows are stacked instead, and passing keys creates a hierarchical index that records which DataFrame each row came from.
pd.concat([df1,df2,df3],axis=0,keys=["A","B","C"])
0 | 1 | 2 | ||
---|---|---|---|---|
A | 0 | 0 | 1 | 2 |
1 | 1 | 2 | 3 | |
2 | 2 | 3 | 4 | |
B | 0 | 4 | 5 | 6 |
1 | 6 | 7 | 8 | |
2 | 8 | 9 | 10 | |
C | 0 | 11 | 12 | 13 |
1 | 13 | 14 | 15 | |
2 | 15 | 16 | 17 |
The append method can easily concatenate rows, providing a subset of the concat method's functionality. (In newer versions of pandas, append has been deprecated in favor of concat.)
df1.append([df2,df3],ignore_index=True)
0 | 1 | 2 | |
---|---|---|---|
0 | 0 | 1 | 2 |
1 | 1 | 2 | 3 |
2 | 2 | 3 | 4 |
3 | 4 | 5 | 6 |
4 | 6 | 7 | 8 |
5 | 8 | 9 | 10 |
6 | 11 | 12 | 13 |
7 | 13 | 14 | 15 |
8 | 15 | 16 | 17 |
The merge method combines two DataFrames based on column values, similar to a SQL join. The left and right parameters are the respective DataFrames, while the left_on and right_on parameters dictate which columns will be used for the join.
data1 = {
"Student":["John","Chris","James"],
"Class":["A","A","C"],
"Midterm_Grade":[50,60,80],
"Final":[60,60,80],
}
data2 = {
"Class":["A","B","C","D","E"],
"Student Average":[77,88,99,87,78],
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
  Student Class  Midterm_Grade  Final
0    John     A             50     60
1   Chris     A             60     60
2   James     C             80     80

  Class  Student Average
0     A               77
1     B               88
2     C               99
3     D               87
4     E               78
Here how="outer" keeps classes even when no student matches, and validate="m:1" asserts that the relationship is many-to-one.
pd.merge(left=df1,right=df2,left_on="Class",right_on="Class",how="outer",validate="m:1")
Student | Class | Midterm_Grade | Final | Student Average | |
---|---|---|---|---|---|
0 | John | A | 50.0 | 60.0 | 77 |
1 | Chris | A | 60.0 | 60.0 | 77 |
2 | James | C | 80.0 | 80.0 | 99 |
3 | NaN | B | NaN | NaN | 88 |
4 | NaN | D | NaN | NaN | 87 |
5 | NaN | E | NaN | NaN | 78 |
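For comparison, an inner join would keep only the classes that appear in both DataFrames, dropping the B, D, and E rows. A minimal sketch:

pd.merge(left=df1, right=df2, on="Class", how="inner")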
The join method adds the columns of one DataFrame to another based on index values, providing a subset of the functionality of merge.
df1.set_index("Class").join(df2.set_index("Class"),how="outer")
Student | Midterm_Grade | Final | Student Average | |
---|---|---|---|---|
Class | ||||
A | John | 50.0 | 60.0 | 77 |
A | Chris | 60.0 | 60.0 | 77 |
B | NaN | NaN | NaN | 88 |
C | James | 80.0 | 80.0 | 99 |
D | NaN | NaN | NaN | 87 |
E | NaN | NaN | NaN | 78 |
df = pd.read_csv("data/cleaned/googleplaystoreclean.csv",index_col=[0],compression='zip',usecols=["Category","Price"])
Next we can group by Category and take the mean price of each group. The groupby method is an essential DataFrame tool that is used quite often.
category_avg_price = df.groupby(by=["Category"],axis='index').mean()
category_avg_price
Price | |
---|---|
Category | |
ART_AND_DESIGN | 0.093281 |
AUTO_AND_VEHICLES | 0.158471 |
BEAUTY | 0.000000 |
BOOKS_AND_REFERENCE | 0.520739 |
BUSINESS | 0.402761 |
COMICS | 0.000000 |
COMMUNICATION | 0.214832 |
DATING | 0.134316 |
EDUCATION | 0.115128 |
ENTERTAINMENT | 0.053557 |
EVENTS | 1.718594 |
FAMILY | 1.236682 |
FINANCE | 7.925765 |
FOOD_AND_DRINK | 0.066772 |
GAME | 0.251136 |
HEALTH_AND_FITNESS | 0.197478 |
HOUSE_AND_HOME | 0.000000 |
LIBRARIES_AND_DEMO | 0.011786 |
LIFESTYLE | 6.180288 |
MAPS_AND_NAVIGATION | 0.196715 |
MEDICAL | 3.110065 |
NEWS_AND_MAGAZINES | 0.014064 |
PARENTING | 0.159667 |
PERSONALIZATION | 0.390949 |
PHOTOGRAPHY | 0.400627 |
PRODUCTIVITY | 0.591816 |
SHOPPING | 0.021077 |
SOCIAL | 0.054136 |
SPORTS | 0.260417 |
TOOLS | 0.316599 |
TRAVEL_AND_LOCAL | 0.193605 |
VIDEO_PLAYERS | 0.059771 |
WEATHER | 0.395366 |
Sort the results by Price in descending order.
sorted_avg_price = category_avg_price.sort_values(by=["Price"],axis='index',ascending=False)
sorted_avg_price
Price | |
---|---|
Category | |
FINANCE | 7.925765 |
LIFESTYLE | 6.180288 |
MEDICAL | 3.110065 |
EVENTS | 1.718594 |
FAMILY | 1.236682 |
PRODUCTIVITY | 0.591816 |
BOOKS_AND_REFERENCE | 0.520739 |
BUSINESS | 0.402761 |
PHOTOGRAPHY | 0.400627 |
WEATHER | 0.395366 |
PERSONALIZATION | 0.390949 |
TOOLS | 0.316599 |
SPORTS | 0.260417 |
GAME | 0.251136 |
COMMUNICATION | 0.214832 |
HEALTH_AND_FITNESS | 0.197478 |
MAPS_AND_NAVIGATION | 0.196715 |
TRAVEL_AND_LOCAL | 0.193605 |
PARENTING | 0.159667 |
AUTO_AND_VEHICLES | 0.158471 |
DATING | 0.134316 |
EDUCATION | 0.115128 |
ART_AND_DESIGN | 0.093281 |
FOOD_AND_DRINK | 0.066772 |
VIDEO_PLAYERS | 0.059771 |
SOCIAL | 0.054136 |
ENTERTAINMENT | 0.053557 |
SHOPPING | 0.021077 |
NEWS_AND_MAGAZINES | 0.014064 |
LIBRARIES_AND_DEMO | 0.011786 |
COMICS | 0.000000 |
BEAUTY | 0.000000 |
HOUSE_AND_HOME | 0.000000 |
And output the top 5.
analysis1 = sorted_avg_price.iloc[0:5] # identical to sorted_avg_price.head()
analysis1
Price | |
---|---|
Category | |
FINANCE | 7.925765 |
LIFESTYLE | 6.180288 |
MEDICAL | 3.110065 |
EVENTS | 1.718594 |
FAMILY | 1.236682 |
Since each step calls a method on the object returned by the prior step, the identical analysis can also be done in one line.
pd.read_csv("data/cleaned/googleplaystoreclean.csv",index_col=[0],usecols=["Category","Price"],compression='zip').groupby(by=["Category"],axis='index').mean().sort_values(by=["Price"],axis='index',ascending=False).iloc[0:5]
Price | |
---|---|
Category | |
FINANCE | 7.925765 |
LIFESTYLE | 6.180288 |
MEDICAL | 3.110065 |
EVENTS | 1.718594 |
FAMILY | 1.236682 |
The analysis results can be written to an html file.
analysis1.to_html("data/cleaned/Top5MostExpensiveCategories.html")
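Before moving on, note that groupby is not limited to a single statistic; the agg method can compute several at once. A minimal sketch with a hypothetical choice of statistics:

# Mean, maximum, and count of Price per Category
df.groupby(by=["Category"])["Price"].agg(["mean", "max", "count"]).sort_values(by="mean", ascending=False).head()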
The second analysis will determine all apps that have a rating of 4.5 or higher and rank them by Sentiment.
We can start by reading from the HDF file.
rating = pd.read_hdf("data/cleaned/appdata.hd5",key="ratings",columns=["App","Rating"])
review = pd.read_hdf("data/cleaned/appdata.hd5",key="reviews",columns=["App","Sentiment"])
Next we can use boolean indexing to select only the apps with ratings of at least 4.5.
rating = rating[rating["Rating"]>=4.5]
We can replace the Sentiment categories with numerical values.
review["Sentiment"].dtypes
CategoricalDtype(categories=['Negative', 'Neutral', 'Positive'], ordered=False)
review.replace({"Positive":1,"Neutral":0,"Negative":-1},inplace=True)
Then we can merge the DataFrames to have all the necessary information in one DataFrame.
rating_review = rating.merge(right=review,how="inner",left_on="App",right_on="App")
rating_review
App | Rating | Sentiment | |
---|---|---|---|
0 | Colorfit - Drawing & Coloring | 4.7 | 1 |
1 | Colorfit - Drawing & Coloring | 4.7 | -1 |
2 | Colorfit - Drawing & Coloring | 4.7 | 1 |
3 | Colorfit - Drawing & Coloring | 4.7 | -1 |
4 | Colorfit - Drawing & Coloring | 4.7 | 1 |
... | ... | ... | ... |
26878 | A+ Gallery - Photos & Videos | 4.5 | 1 |
26879 | A+ Gallery - Photos & Videos | 4.5 | 1 |
26880 | A+ Gallery - Photos & Videos | 4.5 | 1 |
26881 | A+ Gallery - Photos & Videos | 4.5 | 0 |
26882 | A+ Gallery - Photos & Videos | 4.5 | 1 |
26883 rows × 3 columns
We can now find the mean of the Sentiment grouped by App.
The results will be sorted by average Sentiment score and then by Rating.
analysis2 = rating_review.groupby(by="App",sort=False).mean().sort_values(by=["Sentiment","Rating"],ascending=False)
analysis2
Rating | Sentiment | |
---|---|---|
App | ||
Down Dog: Great Yoga Anywhere | 4.9 | 1.000000 |
GPS Speedometer and Odometer | 4.8 | 1.000000 |
Brightest Flashlight Free ® | 4.7 | 1.000000 |
Calculator - unit converter | 4.7 | 1.000000 |
Daniel Tiger for Parents | 4.7 | 1.000000 |
... | ... | ... |
Cooking Fever | 4.5 | -0.177778 |
Call Blocker | 4.6 | -0.333333 |
Free Live Talk-Video Call | 4.7 | -1.000000 |
Discover Mobile | 4.6 | -1.000000 |
Fruit Block - Puzzle Legend | 4.6 | -1.000000 |
263 rows × 2 columns
Another way to find the mean of the Sentiment grouped by App is to use the pivot_table method. Using App as the index selects it as the group-by variable, and since the default aggregation function is mean, we do not need to specify it. Using the eq and all methods together, we can check that every cell is the same in both DataFrames.
analysis2_pivot = rating_review.pivot_table(index="App").sort_values(by=["Sentiment","Rating"],ascending=False)
analysis2_pivot.eq(analysis2).all()
Rating       True
Sentiment    True
dtype: bool
We can write the completed analysis back to the HDF file.
analysis2.to_hdf("data/cleaned/appdata.hd5",key='analysis')
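As a quick sanity check, the analysis can be read back from the file:

pd.read_hdf("data/cleaned/appdata.hd5", key="analysis").head()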
Congrats! You now know enough to perform your own analysis using Pandas! Here are some ideas for further analysis of this dataset to get your feet wet:
Working with these ideas will get you into the mindset of using Pandas, but I think you are ready to take it to the next level!
The following resources will be useful as you continue using Pandas:
Good luck and happy coding!