Skip to main content

WeRateDogs Complete Project

wrangle_act

#Import libraries

In [1]:
import datetime as dt
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import tweepy
from tweepy import API
from tweepy import Cursor
from tweepy import OAuthHandler

%matplotlib inline

# Data gathering section

Do the following activities:

1. Read from the archive file.
2. Read from the TSV file with a URL.
3. Read from Twitter via the Twitter API.

In [2]:
# Load the Twitter archive CSV file into a pandas dataframe.
archive_csv_path = 'twitter-archive-enhanced.csv'
archive_df = pd.read_csv(filepath_or_buffer=archive_csv_path)
In [3]:
# Download the image-predictions TSV file from its URL with requests.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
r = requests.get(url, allow_redirects=True)
# Stop on an HTTP error instead of saving an error page as the data file.
r.raise_for_status()

# Write and read the SAME path. The download used to be saved as
# 'image_predictions.tsv' while 'image-predictions.tsv' was read, so the
# freshly downloaded file was never the one that got loaded.
image_predictions_path = 'image_predictions.tsv'
with open(image_predictions_path, 'wb') as tsv_file:  # closes the handle
    tsv_file.write(r.content)

# Read the downloaded TSV file into a dataframe.
images_df = pd.read_csv(image_predictions_path, sep='\t', encoding='utf-8')
In [4]:
# Read data from Twitter using the Twitter API (tweepy).

# Credentials come from environment variables so that real secrets are never
# saved in the notebook. When a variable is not set, the '**' placeholder is
# used, exactly as before.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '**')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '**')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '**')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '**')

# Build the authenticated tweepy API client.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# The JSON parser makes every call return plain dicts; the rate-limit flags
# make the client sleep (and announce it) instead of failing when throttled.
api = tweepy.API(auth,
                 parser = tweepy.parsers.JSONParser(),
                 wait_on_rate_limit = True,
                 wait_on_rate_limit_notify = True)

# List of per-tweet records (one dict per tweet), turned into a dataframe below.
tweet_df_list = []

# Get the tweet details for every tweet id from the archive dataframe.
for tweet_id in archive_df['tweet_id']:
    try:
        page = api.get_status(tweet_id, tweet_mode = 'extended')
        favorites = page['favorite_count']
        retweets = page['retweet_count']
        user_followers = page['user']['followers_count']
        user_favourites = page['user']['favourites_count']
        date_time = page['created_at']

        tweet_df_list.append({'tweet_id': int(tweet_id),
                              'favorites': int(favorites),
                              'retweets': int(retweets),
                              'user_followers': int(user_followers),
                              'user_favourites': int(user_favourites),
                              'date_time': pd.to_datetime(date_time)})
    except Exception as e:
        # Best effort: a tweet that cannot be fetched (e.g. the API answers
        # "No status found with that ID.") is logged and skipped so the
        # remaining ids are still collected.
        print(str(tweet_id)+ " _ " + str(e))

# Convert the list of records to a dataframe with a fixed column order.
tweet_df = pd.DataFrame(tweet_df_list, columns = ['tweet_id', 'favorites', 'retweets',
                                                  'user_followers', 'user_favourites', 'date_time'])

# Save the dataframe to a file so later cells can reload it without the API.
tweet_df.to_csv('tweet_list.txt', encoding = 'utf-8', index=False)
888202515573088257 _ [{'code': 144, 'message': 'No status found with that ID.'}]
873697596434513921 _ [{'code': 144, 'message': 'No status found with that ID.'}]
869988702071779329 _ [{'code': 144, 'message': 'No status found with that ID.'}]
866816280283807744 _ [{'code': 144, 'message': 'No status found with that ID.'}]
861769973181624320 _ [{'code': 144, 'message': 'No status found with that ID.'}]
845459076796616705 _ [{'code': 144, 'message': 'No status found with that ID.'}]
842892208864923648 _ [{'code': 144, 'message': 'No status found with that ID.'}]
837012587749474308 _ [{'code': 144, 'message': 'No status found with that ID.'}]
827228250799742977 _ [{'code': 144, 'message': 'No status found with that ID.'}]
802247111496568832 _ [{'code': 144, 'message': 'No status found with that ID.'}]
775096608509886464 _ [{'code': 144, 'message': 'No status found with that ID.'}]
Rate limit reached. Sleeping for: 732
754011816964026368 _ [{'code': 144, 'message': 'No status found with that ID.'}]
Rate limit reached. Sleeping for: 731
In [5]:
# Reload the saved tweet details from disk into a dataframe.
tweet_list_path = 'tweet_list.txt'
tweet_df = pd.read_csv(tweet_list_path, encoding='utf-8')

Data gathering section ends.

Data assessment section begins.

In [8]:
# Visual assessment: display the archive dataframe for an eyeball check.
# It is the last expression in the cell, so the notebook renders it.

archive_df
Out[8]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
5 891087950875897856 NaN NaN 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN NaN NaN https://twitter.com/dog_rates/status/891087950... 13 10 None None None None None
6 890971913173991426 NaN NaN 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN NaN NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... 13 10 Jax None None None None
7 890729181411237888 NaN NaN 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN NaN NaN https://twitter.com/dog_rates/status/890729181... 13 10 None None None None None
8 890609185150312448 NaN NaN 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN NaN NaN https://twitter.com/dog_rates/status/890609185... 13 10 Zoey None None None None
9 890240255349198849 NaN NaN 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN NaN NaN https://twitter.com/dog_rates/status/890240255... 14 10 Cassie doggo None None None
10 890006608113172480 NaN NaN 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN NaN NaN https://twitter.com/dog_rates/status/890006608... 13 10 Koda None None None None
11 889880896479866881 NaN NaN 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN NaN NaN https://twitter.com/dog_rates/status/889880896... 13 10 Bruno None None None None
12 889665388333682689 NaN NaN 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN NaN NaN https://twitter.com/dog_rates/status/889665388... 13 10 None None None None puppo
13 889638837579907072 NaN NaN 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN NaN NaN https://twitter.com/dog_rates/status/889638837... 12 10 Ted None None None None
14 889531135344209921 NaN NaN 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN NaN NaN https://twitter.com/dog_rates/status/889531135... 13 10 Stuart None None None puppo
15 889278841981685760 NaN NaN 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN NaN NaN https://twitter.com/dog_rates/status/889278841... 13 10 Oliver None None None None
16 888917238123831296 NaN NaN 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN NaN NaN https://twitter.com/dog_rates/status/888917238... 12 10 Jim None None None None
17 888804989199671297 NaN NaN 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN NaN NaN https://twitter.com/dog_rates/status/888804989... 13 10 Zeke None None None None
18 888554962724278272 NaN NaN 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN NaN NaN https://twitter.com/dog_rates/status/888554962... 13 10 Ralphus None None None None
19 888202515573088257 NaN NaN 2017-07-21 01:02:36 +0000 <a href="http://twitter.com/download/iphone" r... RT @dog_rates: This is Canela. She attempted s... 8.874740e+17 4.196984e+09 2017-07-19 00:47:34 +0000 https://twitter.com/dog_rates/status/887473957... 13 10 Canela None None None None
20 888078434458587136 NaN NaN 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN NaN NaN https://twitter.com/dog_rates/status/888078434... 12 10 Gerald None None None None
21 887705289381826560 NaN NaN 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN NaN NaN https://twitter.com/dog_rates/status/887705289... 13 10 Jeffrey None None None None
22 887517139158093824 NaN NaN 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN NaN NaN https://twitter.com/dog_rates/status/887517139... 14 10 such None None None None
23 887473957103951883 NaN NaN 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN NaN NaN https://twitter.com/dog_rates/status/887473957... 13 10 Canela None None None None
24 887343217045368832 NaN NaN 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN NaN NaN https://twitter.com/dog_rates/status/887343217... 13 10 None None None None None
25 887101392804085760 NaN NaN 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN NaN NaN https://twitter.com/dog_rates/status/887101392... 12 10 None None None None None
26 886983233522544640 NaN NaN 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN NaN NaN https://twitter.com/dog_rates/status/886983233... 13 10 Maya None None None None
27 886736880519319552 NaN NaN 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN NaN NaN https://www.gofundme.com/mingusneedsus,https:/... 13 10 Mingus None None None None
28 886680336477933568 NaN NaN 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN NaN NaN https://twitter.com/dog_rates/status/886680336... 13 10 Derek None None None None
29 886366144734445568 NaN NaN 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN NaN NaN https://twitter.com/dog_rates/status/886366144... 12 10 Roscoe None None pupper None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2326 666411507551481857 NaN NaN 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN NaN NaN https://twitter.com/dog_rates/status/666411507... 2 10 quite None None None None
2327 666407126856765440 NaN NaN 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN NaN NaN https://twitter.com/dog_rates/status/666407126... 7 10 a None None None None
2328 666396247373291520 NaN NaN 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN NaN NaN https://twitter.com/dog_rates/status/666396247... 9 10 None None None None None
2329 666373753744588802 NaN NaN 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN NaN NaN https://twitter.com/dog_rates/status/666373753... 11 10 None None None None None
2330 666362758909284353 NaN NaN 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN NaN NaN https://twitter.com/dog_rates/status/666362758... 6 10 None None None None None
2331 666353288456101888 NaN NaN 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the GalƔpagos... NaN NaN NaN https://twitter.com/dog_rates/status/666353288... 8 10 None None None None None
2332 666345417576210432 NaN NaN 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN NaN NaN https://twitter.com/dog_rates/status/666345417... 10 10 None None None None None
2333 666337882303524864 NaN NaN 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN NaN NaN https://twitter.com/dog_rates/status/666337882... 9 10 an None None None None
2334 666293911632134144 NaN NaN 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN NaN NaN https://twitter.com/dog_rates/status/666293911... 3 10 a None None None None
2335 666287406224695296 NaN NaN 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN NaN NaN https://twitter.com/dog_rates/status/666287406... 1 2 an None None None None
2336 666273097616637952 NaN NaN 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN NaN NaN https://twitter.com/dog_rates/status/666273097... 11 10 None None None None None
2337 666268910803644416 NaN NaN 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN NaN NaN https://twitter.com/dog_rates/status/666268910... 10 10 None None None None None
2338 666104133288665088 NaN NaN 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN NaN NaN https://twitter.com/dog_rates/status/666104133... 1 10 None None None None None
2339 666102155909144576 NaN NaN 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN NaN NaN https://twitter.com/dog_rates/status/666102155... 11 10 None None None None None
2340 666099513787052032 NaN NaN 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN NaN NaN https://twitter.com/dog_rates/status/666099513... 8 10 None None None None None
2341 666094000022159362 NaN NaN 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN NaN NaN https://twitter.com/dog_rates/status/666094000... 9 10 None None None None None
2342 666082916733198337 NaN NaN 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN NaN NaN https://twitter.com/dog_rates/status/666082916... 6 10 None None None None None
2343 666073100786774016 NaN NaN 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN NaN NaN https://twitter.com/dog_rates/status/666073100... 10 10 None None None None None
2344 666071193221509120 NaN NaN 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN NaN NaN https://twitter.com/dog_rates/status/666071193... 9 10 None None None None None
2345 666063827256086533 NaN NaN 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN NaN NaN https://twitter.com/dog_rates/status/666063827... 10 10 the None None None None
2346 666058600524156928 NaN NaN 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN NaN NaN https://twitter.com/dog_rates/status/666058600... 8 10 the None None None None
2347 666057090499244032 NaN NaN 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN NaN NaN https://twitter.com/dog_rates/status/666057090... 9 10 a None None None None
2348 666055525042405380 NaN NaN 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN NaN NaN https://twitter.com/dog_rates/status/666055525... 10 10 a None None None None
2349 666051853826850816 NaN NaN 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN NaN NaN https://twitter.com/dog_rates/status/666051853... 2 10 an None None None None
2350 666050758794694657 NaN NaN 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN NaN NaN https://twitter.com/dog_rates/status/666050758... 10 10 a None None None None
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None None None None
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... 6 10 a None None None None
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... 9 10 a None None None None
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... 7 10 a None None None None
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None None None None

2356 rows × 17 columns

In [9]:
# Visual assessment: display the image-predictions dataframe for an eyeball check.
# It is the last expression in the cell, so the notebook renders it.
images_df
Out[9]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
5 666050758794694657 https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg 1 Bernese_mountain_dog 0.651137 True English_springer 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True
6 666051853826850816 https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg 1 box_turtle 0.933012 False mud_turtle 0.045885 False terrapin 0.017885 False
7 666055525042405380 https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg 1 chow 0.692517 True Tibetan_mastiff 0.058279 True fur_coat 0.054449 False
8 666057090499244032 https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg 1 shopping_cart 0.962465 False shopping_basket 0.014594 False golden_retriever 0.007959 True
9 666058600524156928 https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg 1 miniature_poodle 0.201493 True komondor 0.192305 True soft-coated_wheaten_terrier 0.082086 True
10 666063827256086533 https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg 1 golden_retriever 0.775930 True Tibetan_mastiff 0.093718 True Labrador_retriever 0.072427 True
11 666071193221509120 https://pbs.twimg.com/media/CT5cN_3WEAAlOoZ.jpg 1 Gordon_setter 0.503672 True Yorkshire_terrier 0.174201 True Pekinese 0.109454 True
12 666073100786774016 https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg 1 Walker_hound 0.260857 True English_foxhound 0.175382 True Ibizan_hound 0.097471 True
13 666082916733198337 https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg 1 pug 0.489814 True bull_mastiff 0.404722 True French_bulldog 0.048960 True
14 666094000022159362 https://pbs.twimg.com/media/CT5w9gUW4AAsBNN.jpg 1 bloodhound 0.195217 True German_shepherd 0.078260 True malinois 0.075628 True
15 666099513787052032 https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg 1 Lhasa 0.582330 True Shih-Tzu 0.166192 True Dandie_Dinmont 0.089688 True
16 666102155909144576 https://pbs.twimg.com/media/CT54YGiWUAEZnoK.jpg 1 English_setter 0.298617 True Newfoundland 0.149842 True borzoi 0.133649 True
17 666104133288665088 https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg 1 hen 0.965932 False cock 0.033919 False partridge 0.000052 False
18 666268910803644416 https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg 1 desktop_computer 0.086502 False desk 0.085547 False bookcase 0.079480 False
19 666273097616637952 https://pbs.twimg.com/media/CT8T1mtUwAA3aqm.jpg 1 Italian_greyhound 0.176053 True toy_terrier 0.111884 True basenji 0.111152 True
20 666287406224695296 https://pbs.twimg.com/media/CT8g3BpUEAAuFjg.jpg 1 Maltese_dog 0.857531 True toy_poodle 0.063064 True miniature_poodle 0.025581 True
21 666293911632134144 https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg 1 three-toed_sloth 0.914671 False otter 0.015250 False great_grey_owl 0.013207 False
22 666337882303524864 https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg 1 ox 0.416669 False Newfoundland 0.278407 True groenendael 0.102643 True
23 666345417576210432 https://pbs.twimg.com/media/CT9Vn7PWoAA_ZCM.jpg 1 golden_retriever 0.858744 True Chesapeake_Bay_retriever 0.054787 True Labrador_retriever 0.014241 True
24 666353288456101888 https://pbs.twimg.com/media/CT9cx0tUEAAhNN_.jpg 1 malamute 0.336874 True Siberian_husky 0.147655 True Eskimo_dog 0.093412 True
25 666362758909284353 https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg 1 guinea_pig 0.996496 False skunk 0.002402 False hamster 0.000461 False
26 666373753744588802 https://pbs.twimg.com/media/CT9vZEYWUAAlZ05.jpg 1 soft-coated_wheaten_terrier 0.326467 True Afghan_hound 0.259551 True briard 0.206803 True
27 666396247373291520 https://pbs.twimg.com/media/CT-D2ZHWIAA3gK1.jpg 1 Chihuahua 0.978108 True toy_terrier 0.009397 True papillon 0.004577 True
28 666407126856765440 https://pbs.twimg.com/media/CT-NvwmW4AAugGZ.jpg 1 black-and-tan_coonhound 0.529139 True bloodhound 0.244220 True flat-coated_retriever 0.173810 True
29 666411507551481857 https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg 1 coho 0.404640 False barracouta 0.271485 False gar 0.189945 False
... ... ... ... ... ... ... ... ... ... ... ... ...
2045 886366144734445568 https://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg 1 French_bulldog 0.999201 True Chihuahua 0.000361 True Boston_bull 0.000076 True
2046 886680336477933568 https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg 1 convertible 0.738995 False sports_car 0.139952 False car_wheel 0.044173 False
2047 886736880519319552 https://pbs.twimg.com/media/DE5Se8FXcAAJFx4.jpg 1 kuvasz 0.309706 True Great_Pyrenees 0.186136 True Dandie_Dinmont 0.086346 True
2048 886983233522544640 https://pbs.twimg.com/media/DE8yicJW0AAAvBJ.jpg 2 Chihuahua 0.793469 True toy_terrier 0.143528 True can_opener 0.032253 False
2049 887101392804085760 https://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg 1 Samoyed 0.733942 True Eskimo_dog 0.035029 True Staffordshire_bullterrier 0.029705 True
2050 887343217045368832 https://pbs.twimg.com/ext_tw_video_thumb/88734... 1 Mexican_hairless 0.330741 True sea_lion 0.275645 False Weimaraner 0.134203 True
2051 887473957103951883 https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2 Pembroke 0.809197 True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True
2052 887517139158093824 https://pbs.twimg.com/ext_tw_video_thumb/88751... 1 limousine 0.130432 False tow_truck 0.029175 False shopping_cart 0.026321 False
2053 887705289381826560 https://pbs.twimg.com/media/DFHDQBbXgAEqY7t.jpg 1 basset 0.821664 True redbone 0.087582 True Weimaraner 0.026236 True
2054 888078434458587136 https://pbs.twimg.com/media/DFMWn56WsAAkA7B.jpg 1 French_bulldog 0.995026 True pug 0.000932 True bull_mastiff 0.000903 True
2055 888202515573088257 https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2 Pembroke 0.809197 True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True
2056 888554962724278272 https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg 3 Siberian_husky 0.700377 True Eskimo_dog 0.166511 True malamute 0.111411 True
2057 888804989199671297 https://pbs.twimg.com/media/DFWra-3VYAA2piG.jpg 1 golden_retriever 0.469760 True Labrador_retriever 0.184172 True English_setter 0.073482 True
2058 888917238123831296 https://pbs.twimg.com/media/DFYRgsOUQAARGhO.jpg 1 golden_retriever 0.714719 True Tibetan_mastiff 0.120184 True Labrador_retriever 0.105506 True
2059 889278841981685760 https://pbs.twimg.com/ext_tw_video_thumb/88927... 1 whippet 0.626152 True borzoi 0.194742 True Saluki 0.027351 True
2060 889531135344209921 https://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg 1 golden_retriever 0.953442 True Labrador_retriever 0.013834 True redbone 0.007958 True
2061 889638837579907072 https://pbs.twimg.com/media/DFihzFfXsAYGDPR.jpg 1 French_bulldog 0.991650 True boxer 0.002129 True Staffordshire_bullterrier 0.001498 True
2062 889665388333682689 https://pbs.twimg.com/media/DFi579UWsAAatzw.jpg 1 Pembroke 0.966327 True Cardigan 0.027356 True basenji 0.004633 True
2063 889880896479866881 https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg 1 French_bulldog 0.377417 True Labrador_retriever 0.151317 True muzzle 0.082981 False
2064 890006608113172480 https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg 1 Samoyed 0.957979 True Pomeranian 0.013884 True chow 0.008167 True
2065 890240255349198849 https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg 1 Pembroke 0.511319 True Cardigan 0.451038 True Chihuahua 0.029248 True
2066 890609185150312448 https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg 1 Irish_terrier 0.487574 True Irish_setter 0.193054 True Chesapeake_Bay_retriever 0.118184 True
2067 890729181411237888 https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg 2 Pomeranian 0.566142 True Eskimo_dog 0.178406 True Pembroke 0.076507 True
2068 890971913173991426 https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg 1 Appenzeller 0.341703 True Border_collie 0.199287 True ice_lolly 0.193548 False
2069 891087950875897856 https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg 1 Chesapeake_Bay_retriever 0.425595 True Irish_terrier 0.116317 True Indian_elephant 0.076902 False
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 orange 0.097049 False bagel 0.085851 False banana 0.076110 False

2075 rows × 12 columns

In [10]:
# Visual assessment: display the tweet dataframe for an eyeball check.
# It is the last expression in the cell, so the notebook renders it.
tweet_df
Out[10]:
tweet_id favorites retweets user_followers user_favourites date_time
0 892420643555336193 38680 8559 6997101 134700 2017-08-01 16:23:56
1 892177421306343426 33156 6291 6997101 134700 2017-08-01 00:17:27
2 891815181378084864 24958 4170 6997101 134700 2017-07-31 00:18:03
3 891689557279858688 42064 8687 6997101 134700 2017-07-30 15:58:51
4 891327558926688256 40212 9447 6997101 134700 2017-07-29 16:00:24
5 891087950875897856 20163 3127 6997101 134700 2017-07-29 00:08:17
6 890971913173991426 11816 2082 6997101 134700 2017-07-28 16:27:12
7 890729181411237888 65338 18971 6997101 134700 2017-07-28 00:22:40
8 890609185150312448 27713 4279 6997101 134700 2017-07-27 16:25:51
9 890240255349198849 31857 7448 6997101 134700 2017-07-26 15:59:51
10 890006608113172480 30583 7363 6997101 134700 2017-07-26 00:31:25
11 889880896479866881 27719 4989 6997101 134700 2017-07-25 16:11:53
12 889665388333682689 48003 10101 6997101 134700 2017-07-25 01:55:32
13 889638837579907072 27103 4564 6997101 134700 2017-07-25 00:10:02
14 889531135344209921 15052 2244 6997101 134700 2017-07-24 17:02:04
15 889278841981685760 25236 5446 6997101 134700 2017-07-24 00:19:32
16 888917238123831296 28996 4517 6997101 134700 2017-07-23 00:22:39
17 888804989199671297 25527 4363 6997101 134700 2017-07-22 16:56:37
18 888554962724278272 19850 3596 6997101 134700 2017-07-22 00:23:06
19 888078434458587136 21702 3511 6997101 134700 2017-07-20 16:49:33
20 887705289381826560 30108 5414 6997101 134700 2017-07-19 16:06:48
21 887517139158093824 46119 11713 6997101 134700 2017-07-19 03:39:09
22 887473957103951883 68935 18305 6997101 134700 2017-07-19 00:47:34
23 887343217045368832 33582 10446 6997101 134700 2017-07-18 16:08:03
24 887101392804085760 30451 5984 6997101 134700 2017-07-18 00:07:08
25 886983233522544640 35058 7806 6997101 134700 2017-07-17 16:17:36
26 886736880519319552 12037 3311 6997101 134700 2017-07-16 23:58:41
27 886680336477933568 22362 4486 6997101 134700 2017-07-16 20:14:00
28 886366144734445568 21142 3208 6997102 134700 2017-07-15 23:25:31
29 886267009285017600 116 4 6997102 134700 2017-07-15 16:51:35
... ... ... ... ... ... ...
2314 666411507551481857 447 328 6997189 134700 2015-11-17 00:24:19
2315 666407126856765440 110 41 6997189 134700 2015-11-17 00:06:54
2316 666396247373291520 167 86 6997189 134700 2015-11-16 23:23:41
2317 666373753744588802 190 93 6997189 134700 2015-11-16 21:54:18
2318 666362758909284353 778 574 6997189 134700 2015-11-16 21:10:36
2319 666353288456101888 221 73 6997190 134700 2015-11-16 20:32:58
2320 666345417576210432 299 139 6997190 134700 2015-11-16 20:01:42
2321 666337882303524864 198 92 6997190 134700 2015-11-16 19:31:45
2322 666293911632134144 509 357 6997190 134700 2015-11-16 16:37:02
2323 666287406224695296 149 66 6997190 134700 2015-11-16 16:11:11
2324 666273097616637952 176 76 6997190 134700 2015-11-16 15:14:19
2325 666268910803644416 104 35 6997190 134700 2015-11-16 14:57:41
2326 666104133288665088 14346 6634 6997190 134700 2015-11-16 04:02:55
2327 666102155909144576 80 13 6997190 134700 2015-11-16 03:55:04
2328 666099513787052032 156 68 6997190 134700 2015-11-16 03:44:34
2329 666094000022159362 164 74 6997190 134700 2015-11-16 03:22:39
2330 666082916733198337 119 45 6997190 134700 2015-11-16 02:38:37
2331 666073100786774016 322 164 6997190 134700 2015-11-16 01:59:36
2332 666071193221509120 148 62 6997190 134700 2015-11-16 01:52:02
2333 666063827256086533 476 219 6997190 134700 2015-11-16 01:22:45
2334 666058600524156928 112 57 6997190 134700 2015-11-16 01:01:59
2335 666057090499244032 298 142 6997190 134700 2015-11-16 00:55:59
2336 666055525042405380 434 252 6997190 134700 2015-11-16 00:49:46
2337 666051853826850816 1223 853 6997190 134700 2015-11-16 00:35:11
2338 666050758794694657 132 58 6997190 134700 2015-11-16 00:30:50
2339 666049248165822465 109 41 6997190 134700 2015-11-16 00:24:50
2340 666044226329800704 299 141 6997190 134700 2015-11-16 00:04:52
2341 666033412701032449 125 44 6997190 134700 2015-11-15 23:21:54
2342 666029285002620928 129 47 6997190 134700 2015-11-15 23:05:30
2343 666020888022790149 2560 517 6997190 134700 2015-11-15 22:32:08

2344 rows × 6 columns

In [11]:
# Spot-check a couple of arbitrary cells from each of the three gathered sources.
# Each entry is (dataframe, column, row label); values are printed in this order.
spot_checks = [
    (archive_df, 'text', 100),            # archive dataframe
    (archive_df, 'expanded_urls', 200),
    (images_df, 'p1', 50),                # image predictions dataframe
    (images_df, 'p2', 150),
    (tweet_df, 'tweet_id', 50),           # tweet data dataframe
    (tweet_df, 'date_time', 150),
]
for frame, column, row_label in spot_checks:
    print(frame[column][row_label])
Here are my favorite #dogsatpollingstations 
Most voted for a more consistent walking schedule and to increase daily pats tenfold. All 13/10 https://t.co/17FVMl4VZ5
https://twitter.com/dog_rates/status/854010172552949760/photo/1,https://twitter.com/dog_rates/status/854010172552949760/photo/1
triceratops
pug
882268110199369728
2017-05-10 00:08:34
In [12]:
# Programmatic assessment of the archive data.
# info() prints the schema directly. The summaries are kept in named variables;
# only the final expression (dog-name frequencies) is rendered as the cell output.
archive_df.info()
archive_stats = archive_df.describe()
numerator_counts = archive_df['rating_numerator'].value_counts()
denominator_counts = archive_df['rating_denominator'].value_counts()
name_counts = archive_df['name'].value_counts()
name_counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
Out[12]:
None              745
a                  55
Charlie            12
Cooper             11
Lucy               11
Oliver             11
Tucker             10
Lola               10
Penny              10
Winston             9
Bo                  9
Sadie               8
the                 8
Bailey              7
Toby                7
Daisy               7
an                  7
Buddy               7
Leo                 6
Jax                 6
Rusty               6
Milo                6
Jack                6
Dave                6
Oscar               6
Koda                6
Bella               6
Scout               6
Stanley             6
Larry               5
                 ... 
Wiggles             1
Comet               1
Monkey              1
Millie              1
Shikha              1
Linus               1
Schnitzel           1
Burt                1
Willy               1
Napolean            1
Ronduh              1
Miguel              1
Karma               1
Skittles            1
Jerome              1
Jennifur            1
Flash               1
Danny               1
Ralphson            1
Alejandro           1
Jomathan            1
Fiji                1
Livvie              1
Kaiya               1
Michelangelope      1
Cedrick             1
his                 1
Ed                  1
Brandi              1
Rumble              1
Name: name, Length: 957, dtype: int64
In [13]:
# Programmatic assessment of the image-prediction data.
# info() prints the schema directly. The summaries are kept in named variables;
# only the final expression (top-prediction frequencies) is rendered as the cell output.
images_df.info()
image_stats = images_df.describe()
image_id_counts = images_df['tweet_id'].value_counts()
jpg_url_counts = images_df['jpg_url'].value_counts()
p1_counts = images_df['p1'].value_counts()
p1_counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
Out[13]:
golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
malamute                      30
cocker_spaniel                30
French_bulldog                26
Chesapeake_Bay_retriever      23
miniature_pinscher            23
seat_belt                     22
Staffordshire_bullterrier     20
German_shepherd               20
Siberian_husky                20
Cardigan                      19
web_site                      19
Eskimo_dog                    18
teddy                         18
Maltese_dog                   18
Shetland_sheepdog             18
beagle                        18
Rottweiler                    17
Shih-Tzu                      17
Lakeland_terrier              17
kuvasz                        16
Italian_greyhound             16
                            ... 
alp                            1
lacewing                       1
water_bottle                   1
flamingo                       1
piggy_bank                     1
wild_boar                      1
lorikeet                       1
boathouse                      1
stove                          1
ibex                           1
hummingbird                    1
four-poster                    1
Scotch_terrier                 1
hotdog                         1
prayer_rug                     1
rapeseed                       1
guenon                         1
electric_fan                   1
hammer                         1
teapot                         1
military_uniform               1
scorpion                       1
African_crocodile              1
sea_urchin                     1
pitcher                        1
lynx                           1
envelope                       1
banana                         1
standard_schnauzer             1
ice_lolly                      1
Name: p1, Length: 378, dtype: int64
In [14]:
# Programmatic assessment of the data gathered through the Twitter API.
# info() prints the schema directly. The summaries are kept in named variables;
# only the final expression (user_favourites frequencies) is rendered as the cell output.
tweet_df.info()
tweet_stats = tweet_df.describe()
tweet_id_counts = tweet_df['tweet_id'].value_counts()
follower_counts = tweet_df['user_followers'].value_counts()
favourite_counts = tweet_df['user_favourites'].value_counts()
favourite_counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2344 entries, 0 to 2343
Data columns (total 6 columns):
tweet_id           2344 non-null int64
favorites          2344 non-null int64
retweets           2344 non-null int64
user_followers     2344 non-null int64
user_favourites    2344 non-null int64
date_time          2344 non-null object
dtypes: int64(5), object(1)
memory usage: 110.0+ KB
Out[14]:
134700    2326
134701      18
Name: user_favourites, dtype: int64

Assessment section ends..

Cleaning section begins

Create a copy of each dataframe before the actual cleaning process.

In [15]:
# Take an independent copy of each gathered frame in one unpacking statement.
archive_df_copy, images_df_copy, tweet_df_copy = (
    source_frame.copy() for source_frame in (archive_df, images_df, tweet_df)
)
In [16]:
# Verify the copies by taking the first rows of each one.
# Only the final expression (the tweet data preview) is rendered as the cell output.
archive_preview = archive_df_copy.head()
images_preview = images_df_copy.head()
tweet_preview = tweet_df_copy.head()
tweet_preview
Out[16]:
tweet_id favorites retweets user_followers user_favourites date_time
0 892420643555336193 38680 8559 6997101 134700 2017-08-01 16:23:56
1 892177421306343426 33156 6291 6997101 134700 2017-08-01 00:17:27
2 891815181378084864 24958 4170 6997101 134700 2017-07-31 00:18:03
3 891689557279858688 42064 8687 6997101 134700 2017-07-30 15:58:51
4 891327558926688256 40212 9447 6997101 134700 2017-07-29 16:00:24

data copying ends....

Define:


Merge the 3 data sources into a single dataframe and then save it to a CSV file. Use tweet_id as the merge key. This is a tidiness issue.

In [77]:
#Code
# Inner-join the archive, the image predictions and the API data on tweet_id,
# so only tweets present in all three sources are kept, then persist the result.
df_master = (
    archive_df
    .merge(images_df, how='inner', on=['tweet_id'])
    .merge(tweet_df, how='inner', on=['tweet_id'])
)
df_master.to_csv('df_master.csv', encoding='utf-8')
In [78]:
# Test: print the merged frame's schema (column names, non-null counts, dtypes)
# to confirm that the columns of all three sources are present after the merge.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2068 entries, 0 to 2067
Data columns (total 33 columns):
tweet_id                      2068 non-null int64
in_reply_to_status_id         23 non-null float64
in_reply_to_user_id           23 non-null float64
timestamp                     2068 non-null object
source                        2068 non-null object
text                          2068 non-null object
retweeted_status_id           75 non-null float64
retweeted_status_user_id      75 non-null float64
retweeted_status_timestamp    75 non-null object
expanded_urls                 2068 non-null object
rating_numerator              2068 non-null int64
rating_denominator            2068 non-null int64
name                          2068 non-null object
doggo                         2068 non-null object
floofer                       2068 non-null object
pupper                        2068 non-null object
puppo                         2068 non-null object
jpg_url                       2068 non-null object
img_num                       2068 non-null int64
p1                            2068 non-null object
p1_conf                       2068 non-null float64
p1_dog                        2068 non-null bool
p2                            2068 non-null object
p2_conf                       2068 non-null float64
p2_dog                        2068 non-null bool
p3                            2068 non-null object
p3_conf                       2068 non-null float64
p3_dog                        2068 non-null bool
favorites                     2068 non-null int64
retweets                      2068 non-null int64
user_followers                2068 non-null int64
user_favourites               2068 non-null int64
date_time                     2068 non-null object
dtypes: bool(3), float64(7), int64(8), object(15)
memory usage: 506.9+ KB
In [79]:
# Test
# Row count of the merged frame before duplicate rows are removed.
df_master.shape[0]
Out[79]:
2068

Define

Delete Duplicate rows.

In [80]:
# Code
# Remove fully identical rows, keeping the first occurrence of each.
df_master = df_master.drop_duplicates(keep='first')
In [81]:
#Test
# After de-duplication no row may be flagged as a duplicate of another one.
# Fail loudly instead of relying on a visual check of the rendered table.
duplicate_rows = df_master[df_master.duplicated(keep=False)]
assert duplicate_rows.empty, "duplicate rows remain after drop_duplicates()"
# Rendered as an empty (0-row) frame when the cleaning step succeeded.
duplicate_rows
Out[81]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls ... p2_conf p2_dog p3 p3_conf p3_dog favorites retweets user_followers user_favourites date_time

0 rows × 33 columns

Define

Delete retweet rows: delete every row where retweeted_status_id or retweeted_status_user_id is not NaN.

In [84]:
#Code
# Original tweets carry no retweet metadata, i.e. both retweet columns are missing.
# Missing values are detected with isnull(): matching against the string 'NaN'
# through isin() does not reliably select float NaN entries and can leave the
# frame empty on current pandas versions.
is_original_tweet = (
    df_master['retweeted_status_id'].isnull()
    & df_master['retweeted_status_user_id'].isnull()
)
df_master = df_master.loc[is_original_tweet]
In [85]:
#test
# The retweet filter must leave rows behind, and none of the remaining rows may
# still carry retweet metadata.
assert not df_master.empty, "retweet filter removed every row"
assert df_master['retweeted_status_id'].isnull().all(), "retweet rows remain"
assert df_master['retweeted_status_user_id'].isnull().all(), "retweet rows remain"
df_master
Out[85]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls ... p2_conf p2_dog p3 p3_conf p3_dog favorites retweets user_followers user_favourites date_time
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... ... 0.085851 False banana 0.076110 False 38680 8559 6997101 134700 2017-08-01 16:23:56
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... ... 0.090647 True papillon 0.068957 True 33156 6291 6997101 134700 2017-08-01 00:17:27
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... ... 0.078253 True kelpie 0.031379 True 24958 4170 6997101 134700 2017-07-31 00:18:03
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... ... 0.168086 True spatula 0.040836 False 42064 8687 6997101 134700 2017-07-30 15:58:51
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... ... 0.225770 True German_short-haired_pointer 0.175219 True 40212 9447 6997101 134700 2017-07-29 16:00:24
5 891087950875897856 NaN NaN 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN NaN NaN https://twitter.com/dog_rates/status/891087950... ... 0.116317 True Indian_elephant 0.076902 False 20163 3127 6997101 134700 2017-07-29 00:08:17
6 890971913173991426 NaN NaN 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN NaN NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... ... 0.199287 True ice_lolly 0.193548 False 11816 2082 6997101 134700 2017-07-28 16:27:12
7 890729181411237888 NaN NaN 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN NaN NaN https://twitter.com/dog_rates/status/890729181... ... 0.178406 True Pembroke 0.076507 True 65338 18971 6997101 134700 2017-07-28 00:22:40
8 890609185150312448 NaN NaN 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN NaN NaN https://twitter.com/dog_rates/status/890609185... ... 0.193054 True Chesapeake_Bay_retriever 0.118184 True 27713 4279 6997101 134700 2017-07-27 16:25:51
9 890240255349198849 NaN NaN 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN NaN NaN https://twitter.com/dog_rates/status/890240255... ... 0.451038 True Chihuahua 0.029248 True 31857 7448 6997101 134700 2017-07-26 15:59:51
10 890006608113172480 NaN NaN 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN NaN NaN https://twitter.com/dog_rates/status/890006608... ... 0.013884 True chow 0.008167 True 30583 7363 6997101 134700 2017-07-26 00:31:25
11 889880896479866881 NaN NaN 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN NaN NaN https://twitter.com/dog_rates/status/889880896... ... 0.151317 True muzzle 0.082981 False 27719 4989 6997101 134700 2017-07-25 16:11:53
12 889665388333682689 NaN NaN 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN NaN NaN https://twitter.com/dog_rates/status/889665388... ... 0.027356 True basenji 0.004633 True 48003 10101 6997101 134700 2017-07-25 01:55:32
13 889638837579907072 NaN NaN 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN NaN NaN https://twitter.com/dog_rates/status/889638837... ... 0.002129 True Staffordshire_bullterrier 0.001498 True 27103 4564 6997101 134700 2017-07-25 00:10:02
14 889531135344209921 NaN NaN 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN NaN NaN https://twitter.com/dog_rates/status/889531135... ... 0.013834 True redbone 0.007958 True 15052 2244 6997101 134700 2017-07-24 17:02:04
15 889278841981685760 NaN NaN 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN NaN NaN https://twitter.com/dog_rates/status/889278841... ... 0.194742 True Saluki 0.027351 True 25236 5446 6997101 134700 2017-07-24 00:19:32
16 888917238123831296 NaN NaN 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN NaN NaN https://twitter.com/dog_rates/status/888917238... ... 0.120184 True Labrador_retriever 0.105506 True 28996 4517 6997101 134700 2017-07-23 00:22:39
17 888804989199671297 NaN NaN 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN NaN NaN https://twitter.com/dog_rates/status/888804989... ... 0.184172 True English_setter 0.073482 True 25527 4363 6997101 134700 2017-07-22 16:56:37
18 888554962724278272 NaN NaN 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN NaN NaN https://twitter.com/dog_rates/status/888554962... ... 0.166511 True malamute 0.111411 True 19850 3596 6997101 134700 2017-07-22 00:23:06
19 888078434458587136 NaN NaN 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN NaN NaN https://twitter.com/dog_rates/status/888078434... ... 0.000932 True bull_mastiff 0.000903 True 21702 3511 6997101 134700 2017-07-20 16:49:33
20 887705289381826560 NaN NaN 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN NaN NaN https://twitter.com/dog_rates/status/887705289... ... 0.087582 True Weimaraner 0.026236 True 30108 5414 6997101 134700 2017-07-19 16:06:48
21 887517139158093824 NaN NaN 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN NaN NaN https://twitter.com/dog_rates/status/887517139... ... 0.029175 False shopping_cart 0.026321 False 46119 11713 6997101 134700 2017-07-19 03:39:09
22 887473957103951883 NaN NaN 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN NaN NaN https://twitter.com/dog_rates/status/887473957... ... 0.054950 True beagle 0.038915 True 68935 18305 6997101 134700 2017-07-19 00:47:34
23 887343217045368832 NaN NaN 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN NaN NaN https://twitter.com/dog_rates/status/887343217... ... 0.275645 False Weimaraner 0.134203 True 33582 10446 6997101 134700 2017-07-18 16:08:03
24 887101392804085760 NaN NaN 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN NaN NaN https://twitter.com/dog_rates/status/887101392... ... 0.035029 True Staffordshire_bullterrier 0.029705 True 30451 5984 6997101 134700 2017-07-18 00:07:08
25 886983233522544640 NaN NaN 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN NaN NaN https://twitter.com/dog_rates/status/886983233... ... 0.143528 True can_opener 0.032253 False 35058 7806 6997101 134700 2017-07-17 16:17:36
26 886736880519319552 NaN NaN 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN NaN NaN https://www.gofundme.com/mingusneedsus,https:/... ... 0.186136 True Dandie_Dinmont 0.086346 True 12037 3311 6997101 134700 2017-07-16 23:58:41
27 886680336477933568 NaN NaN 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN NaN NaN https://twitter.com/dog_rates/status/886680336... ... 0.139952 False car_wheel 0.044173 False 22362 4486 6997101 134700 2017-07-16 20:14:00
28 886366144734445568 NaN NaN 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN NaN NaN https://twitter.com/dog_rates/status/886366144... ... 0.000361 True Boston_bull 0.000076 True 21142 3208 6997102 134700 2017-07-15 23:25:31
29 886258384151887873 NaN NaN 2017-07-15 16:17:19 +0000 <a href="http://twitter.com/download/iphone" r... This is Waffles. His doggles are pupside down.... NaN NaN NaN https://twitter.com/dog_rates/status/886258384... ... 0.025286 False Siamese_cat 0.002849 False 27905 6316 6997102 134700 2017-07-15 16:17:19
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2038 666411507551481857 NaN NaN 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN NaN NaN https://twitter.com/dog_rates/status/666411507... ... 0.271485 False gar 0.189945 False 447 328 6997189 134700 2015-11-17 00:24:19
2039 666407126856765440 NaN NaN 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN NaN NaN https://twitter.com/dog_rates/status/666407126... ... 0.244220 True flat-coated_retriever 0.173810 True 110 41 6997189 134700 2015-11-17 00:06:54
2040 666396247373291520 NaN NaN 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN NaN NaN https://twitter.com/dog_rates/status/666396247... ... 0.009397 True papillon 0.004577 True 167 86 6997189 134700 2015-11-16 23:23:41
2041 666373753744588802 NaN NaN 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN NaN NaN https://twitter.com/dog_rates/status/666373753... ... 0.259551 True briard 0.206803 True 190 93 6997189 134700 2015-11-16 21:54:18
2042 666362758909284353 NaN NaN 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN NaN NaN https://twitter.com/dog_rates/status/666362758... ... 0.002402 False hamster 0.000461 False 778 574 6997189 134700 2015-11-16 21:10:36
2043 666353288456101888 NaN NaN 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the GalƔpagos... NaN NaN NaN https://twitter.com/dog_rates/status/666353288... ... 0.147655 True Eskimo_dog 0.093412 True 221 73 6997190 134700 2015-11-16 20:32:58
2044 666345417576210432 NaN NaN 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN NaN NaN https://twitter.com/dog_rates/status/666345417... ... 0.054787 True Labrador_retriever 0.014241 True 299 139 6997190 134700 2015-11-16 20:01:42
2045 666337882303524864 NaN NaN 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN NaN NaN https://twitter.com/dog_rates/status/666337882... ... 0.278407 True groenendael 0.102643 True 198 92 6997190 134700 2015-11-16 19:31:45
2046 666293911632134144 NaN NaN 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN NaN NaN https://twitter.com/dog_rates/status/666293911... ... 0.015250 False great_grey_owl 0.013207 False 509 357 6997190 134700 2015-11-16 16:37:02
2047 666287406224695296 NaN NaN 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN NaN NaN https://twitter.com/dog_rates/status/666287406... ... 0.063064 True miniature_poodle 0.025581 True 149 66 6997190 134700 2015-11-16 16:11:11
2048 666273097616637952 NaN NaN 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN NaN NaN https://twitter.com/dog_rates/status/666273097... ... 0.111884 True basenji 0.111152 True 176 76 6997190 134700 2015-11-16 15:14:19
2049 666268910803644416 NaN NaN 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN NaN NaN https://twitter.com/dog_rates/status/666268910... ... 0.085547 False bookcase 0.079480 False 104 35 6997190 134700 2015-11-16 14:57:41
2050 666104133288665088 NaN NaN 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN NaN NaN https://twitter.com/dog_rates/status/666104133... ... 0.033919 False partridge 0.000052 False 14346 6634 6997190 134700 2015-11-16 04:02:55
2051 666102155909144576 NaN NaN 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN NaN NaN https://twitter.com/dog_rates/status/666102155... ... 0.149842 True borzoi 0.133649 True 80 13 6997190 134700 2015-11-16 03:55:04
2052 666099513787052032 NaN NaN 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN NaN NaN https://twitter.com/dog_rates/status/666099513... ... 0.166192 True Dandie_Dinmont 0.089688 True 156 68 6997190 134700 2015-11-16 03:44:34
2053 666094000022159362 NaN NaN 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN NaN NaN https://twitter.com/dog_rates/status/666094000... ... 0.078260 True malinois 0.075628 True 164 74 6997190 134700 2015-11-16 03:22:39
2054 666082916733198337 NaN NaN 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN NaN NaN https://twitter.com/dog_rates/status/666082916... ... 0.404722 True French_bulldog 0.048960 True 119 45 6997190 134700 2015-11-16 02:38:37
2055 666073100786774016 NaN NaN 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN NaN NaN https://twitter.com/dog_rates/status/666073100... ... 0.175382 True Ibizan_hound 0.097471 True 322 164 6997190 134700 2015-11-16 01:59:36
2056 666071193221509120 NaN NaN 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN NaN NaN https://twitter.com/dog_rates/status/666071193... ... 0.174201 True Pekinese 0.109454 True 148 62 6997190 134700 2015-11-16 01:52:02
2057 666063827256086533 NaN NaN 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN NaN NaN https://twitter.com/dog_rates/status/666063827... ... 0.093718 True Labrador_retriever 0.072427 True 476 219 6997190 134700 2015-11-16 01:22:45
2058 666058600524156928 NaN NaN 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN NaN NaN https://twitter.com/dog_rates/status/666058600... ... 0.192305 True soft-coated_wheaten_terrier 0.082086 True 112 57 6997190 134700 2015-11-16 01:01:59
2059 666057090499244032 NaN NaN 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN NaN NaN https://twitter.com/dog_rates/status/666057090... ... 0.014594 False golden_retriever 0.007959 True 298 142 6997190 134700 2015-11-16 00:55:59
2060 666055525042405380 NaN NaN 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN NaN NaN https://twitter.com/dog_rates/status/666055525... ... 0.058279 True fur_coat 0.054449 False 434 252 6997190 134700 2015-11-16 00:49:46
2061 666051853826850816 NaN NaN 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN NaN NaN https://twitter.com/dog_rates/status/666051853... ... 0.045885 False terrapin 0.017885 False 1223 853 6997190 134700 2015-11-16 00:35:11
2062 666050758794694657 NaN NaN 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN NaN NaN https://twitter.com/dog_rates/status/666050758... ... 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True 132 58 6997190 134700 2015-11-16 00:30:50
2063 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... ... 0.243682 True Doberman 0.154629 True 109 41 6997190 134700 2015-11-16 00:24:50
2064 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... ... 0.360687 True miniature_pinscher 0.222752 True 299 141 6997190 134700 2015-11-16 00:04:52
2065 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... ... 0.138584 True bloodhound 0.116197 True 125 44 6997190 134700 2015-11-15 23:21:54
2066 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... ... 0.074192 True Rhodesian_ridgeback 0.072010 True 129 47 6997190 134700 2015-11-15 23:05:30
2067 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... ... 0.156665 True Shetland_sheepdog 0.061428 True 2560 517 6997190 134700 2015-11-15 22:32:08

1993 rows × 33 columns

Define

Quality issue #3: Delete unneeded columns, as most of their values are NaN.

In [86]:
#code
# Drop the reply / retweet metadata columns in a single call.
# The axis is passed by keyword: the positional form drop(label, 1) is no longer
# accepted by pandas 2.x.
# NOTE: retweeted_status_user_id is not part of this list and stays in the frame.
sparse_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_timestamp',
]
df_master = df_master.drop(sparse_columns, axis=1)
In [87]:
#Test
# The four dropped columns must no longer be part of the frame.
removed_columns = {
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_timestamp',
}
assert removed_columns.isdisjoint(df_master.columns), "dropped columns still present"
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 29 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    0 non-null float64
expanded_urls               1993 non-null object
rating_numerator            1993 non-null int64
rating_denominator          1993 non-null int64
name                        1993 non-null object
doggo                       1993 non-null object
floofer                     1993 non-null object
pupper                      1993 non-null object
puppo                       1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
p1                          1993 non-null object
p1_conf                     1993 non-null float64
p1_dog                      1993 non-null bool
p2                          1993 non-null object
p2_conf                     1993 non-null float64
p2_dog                      1993 non-null bool
p3                          1993 non-null object
p3_conf                     1993 non-null float64
p3_dog                      1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
date_time                   1993 non-null object
dtypes: bool(3), float64(4), int64(8), object(14)
memory usage: 426.2+ KB

Define

Data quality issue #4: Delete date_time, as it duplicates timestamp; the same value comes from 2 data sources.

In [88]:
#code
# date_time repeats the information already held in timestamp, so remove it.
# The axis is passed by keyword: the positional form drop(label, 1) is no longer
# accepted by pandas 2.x.
df_master = df_master.drop('date_time', axis=1)
In [89]:
# Test: date_time should be absent from the column listing and the column
# count should be one lower than before.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 28 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    0 non-null float64
expanded_urls               1993 non-null object
rating_numerator            1993 non-null int64
rating_denominator          1993 non-null int64
name                        1993 non-null object
doggo                       1993 non-null object
floofer                     1993 non-null object
pupper                      1993 non-null object
puppo                       1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
p1                          1993 non-null object
p1_conf                     1993 non-null float64
p1_dog                      1993 non-null bool
p2                          1993 non-null object
p2_conf                     1993 non-null float64
p2_dog                      1993 non-null bool
p3                          1993 non-null object
p3_conf                     1993 non-null float64
p3_dog                      1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dtypes: bool(3), float64(4), int64(8), object(13)
memory usage: 410.7+ KB

define

data quality issue #5

For every row whose denominator is less than 10, replace the denominator with 10, as the denominator should not be less than 10.

In [90]:
# Code: raise every denominator that is below 10 up to 10; values of 10 or
# more are left untouched (clip with a lower bound only).
df_master['rating_denominator'] = df_master['rating_denominator'].clip(lower=10)
In [91]:
# Test
# An empty result (0 rows) shows that the cleaning step was successful.
df_master.loc[df_master['rating_denominator'] < 10]
Out[91]:
tweet_id timestamp source text retweeted_status_user_id expanded_urls rating_numerator rating_denominator name doggo ... p2 p2_conf p2_dog p3 p3_conf p3_dog favorites retweets user_followers user_favourites

0 rows × 28 columns

define

data quality issue #6

In the name field, a number of values are invalid. A proper dog name starts with an uppercase letter, so replace every all-lowercase name with 'None'.

In [92]:
# Code: names written entirely in lowercase are treated as invalid and are
# overwritten with the 'None' placeholder string.
is_lowercase_name = df_master['name'].str.islower().eq(True)
df_master.loc[is_lowercase_name, 'name'] = 'None'
In [93]:
# Test
# Eyeball check: invalid names such as 'a', 'an', 'the' should now read 'None'.
# NOTE(review): this displays the whole frame; a view filtered on the lowercase
# names would be a tighter check.
df_master
Out[93]:
tweet_id timestamp source text retweeted_status_user_id expanded_urls rating_numerator rating_denominator name doggo ... p2 p2_conf p2_dog p3 p3_conf p3_dog favorites retweets user_followers user_favourites
0 892420643555336193 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None ... bagel 0.085851 False banana 0.076110 False 38680 8559 6997101 134700
1 892177421306343426 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None ... Pekinese 0.090647 True papillon 0.068957 True 33156 6291 6997101 134700
2 891815181378084864 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None ... malamute 0.078253 True kelpie 0.031379 True 24958 4170 6997101 134700
3 891689557279858688 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None ... Labrador_retriever 0.168086 True spatula 0.040836 False 42064 8687 6997101 134700
4 891327558926688256 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None ... English_springer 0.225770 True German_short-haired_pointer 0.175219 True 40212 9447 6997101 134700
5 891087950875897856 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN https://twitter.com/dog_rates/status/891087950... 13 10 None None ... Irish_terrier 0.116317 True Indian_elephant 0.076902 False 20163 3127 6997101 134700
6 890971913173991426 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... 13 10 Jax None ... Border_collie 0.199287 True ice_lolly 0.193548 False 11816 2082 6997101 134700
7 890729181411237888 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN https://twitter.com/dog_rates/status/890729181... 13 10 None None ... Eskimo_dog 0.178406 True Pembroke 0.076507 True 65338 18971 6997101 134700
8 890609185150312448 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN https://twitter.com/dog_rates/status/890609185... 13 10 Zoey None ... Irish_setter 0.193054 True Chesapeake_Bay_retriever 0.118184 True 27713 4279 6997101 134700
9 890240255349198849 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN https://twitter.com/dog_rates/status/890240255... 14 10 Cassie doggo ... Cardigan 0.451038 True Chihuahua 0.029248 True 31857 7448 6997101 134700
10 890006608113172480 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN https://twitter.com/dog_rates/status/890006608... 13 10 Koda None ... Pomeranian 0.013884 True chow 0.008167 True 30583 7363 6997101 134700
11 889880896479866881 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN https://twitter.com/dog_rates/status/889880896... 13 10 Bruno None ... Labrador_retriever 0.151317 True muzzle 0.082981 False 27719 4989 6997101 134700
12 889665388333682689 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN https://twitter.com/dog_rates/status/889665388... 13 10 None None ... Cardigan 0.027356 True basenji 0.004633 True 48003 10101 6997101 134700
13 889638837579907072 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN https://twitter.com/dog_rates/status/889638837... 12 10 Ted None ... boxer 0.002129 True Staffordshire_bullterrier 0.001498 True 27103 4564 6997101 134700
14 889531135344209921 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN https://twitter.com/dog_rates/status/889531135... 13 10 Stuart None ... Labrador_retriever 0.013834 True redbone 0.007958 True 15052 2244 6997101 134700
15 889278841981685760 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN https://twitter.com/dog_rates/status/889278841... 13 10 Oliver None ... borzoi 0.194742 True Saluki 0.027351 True 25236 5446 6997101 134700
16 888917238123831296 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN https://twitter.com/dog_rates/status/888917238... 12 10 Jim None ... Tibetan_mastiff 0.120184 True Labrador_retriever 0.105506 True 28996 4517 6997101 134700
17 888804989199671297 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN https://twitter.com/dog_rates/status/888804989... 13 10 Zeke None ... Labrador_retriever 0.184172 True English_setter 0.073482 True 25527 4363 6997101 134700
18 888554962724278272 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN https://twitter.com/dog_rates/status/888554962... 13 10 Ralphus None ... Eskimo_dog 0.166511 True malamute 0.111411 True 19850 3596 6997101 134700
19 888078434458587136 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN https://twitter.com/dog_rates/status/888078434... 12 10 Gerald None ... pug 0.000932 True bull_mastiff 0.000903 True 21702 3511 6997101 134700
20 887705289381826560 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN https://twitter.com/dog_rates/status/887705289... 13 10 Jeffrey None ... redbone 0.087582 True Weimaraner 0.026236 True 30108 5414 6997101 134700
21 887517139158093824 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN https://twitter.com/dog_rates/status/887517139... 14 10 None None ... tow_truck 0.029175 False shopping_cart 0.026321 False 46119 11713 6997101 134700
22 887473957103951883 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN https://twitter.com/dog_rates/status/887473957... 13 10 Canela None ... Rhodesian_ridgeback 0.054950 True beagle 0.038915 True 68935 18305 6997101 134700
23 887343217045368832 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN https://twitter.com/dog_rates/status/887343217... 13 10 None None ... sea_lion 0.275645 False Weimaraner 0.134203 True 33582 10446 6997101 134700
24 887101392804085760 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN https://twitter.com/dog_rates/status/887101392... 12 10 None None ... Eskimo_dog 0.035029 True Staffordshire_bullterrier 0.029705 True 30451 5984 6997101 134700
25 886983233522544640 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN https://twitter.com/dog_rates/status/886983233... 13 10 Maya None ... toy_terrier 0.143528 True can_opener 0.032253 False 35058 7806 6997101 134700
26 886736880519319552 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN https://www.gofundme.com/mingusneedsus,https:/... 13 10 Mingus None ... Great_Pyrenees 0.186136 True Dandie_Dinmont 0.086346 True 12037 3311 6997101 134700
27 886680336477933568 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN https://twitter.com/dog_rates/status/886680336... 13 10 Derek None ... sports_car 0.139952 False car_wheel 0.044173 False 22362 4486 6997101 134700
28 886366144734445568 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN https://twitter.com/dog_rates/status/886366144... 12 10 Roscoe None ... Chihuahua 0.000361 True Boston_bull 0.000076 True 21142 3208 6997102 134700
29 886258384151887873 2017-07-15 16:17:19 +0000 <a href="http://twitter.com/download/iphone" r... This is Waffles. His doggles are pupside down.... NaN https://twitter.com/dog_rates/status/886258384... 13 10 Waffles None ... shower_cap 0.025286 False Siamese_cat 0.002849 False 27905 6316 6997102 134700
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2038 666411507551481857 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN https://twitter.com/dog_rates/status/666411507... 2 10 None None ... barracouta 0.271485 False gar 0.189945 False 447 328 6997189 134700
2039 666407126856765440 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN https://twitter.com/dog_rates/status/666407126... 7 10 None None ... bloodhound 0.244220 True flat-coated_retriever 0.173810 True 110 41 6997189 134700
2040 666396247373291520 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN https://twitter.com/dog_rates/status/666396247... 9 10 None None ... toy_terrier 0.009397 True papillon 0.004577 True 167 86 6997189 134700
2041 666373753744588802 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN https://twitter.com/dog_rates/status/666373753... 11 10 None None ... Afghan_hound 0.259551 True briard 0.206803 True 190 93 6997189 134700
2042 666362758909284353 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN https://twitter.com/dog_rates/status/666362758... 6 10 None None ... skunk 0.002402 False hamster 0.000461 False 778 574 6997189 134700
2043 666353288456101888 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the GalƔpagos... NaN https://twitter.com/dog_rates/status/666353288... 8 10 None None ... Siberian_husky 0.147655 True Eskimo_dog 0.093412 True 221 73 6997190 134700
2044 666345417576210432 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN https://twitter.com/dog_rates/status/666345417... 10 10 None None ... Chesapeake_Bay_retriever 0.054787 True Labrador_retriever 0.014241 True 299 139 6997190 134700
2045 666337882303524864 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN https://twitter.com/dog_rates/status/666337882... 9 10 None None ... Newfoundland 0.278407 True groenendael 0.102643 True 198 92 6997190 134700
2046 666293911632134144 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN https://twitter.com/dog_rates/status/666293911... 3 10 None None ... otter 0.015250 False great_grey_owl 0.013207 False 509 357 6997190 134700
2047 666287406224695296 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN https://twitter.com/dog_rates/status/666287406... 1 10 None None ... toy_poodle 0.063064 True miniature_poodle 0.025581 True 149 66 6997190 134700
2048 666273097616637952 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN https://twitter.com/dog_rates/status/666273097... 11 10 None None ... toy_terrier 0.111884 True basenji 0.111152 True 176 76 6997190 134700
2049 666268910803644416 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN https://twitter.com/dog_rates/status/666268910... 10 10 None None ... desk 0.085547 False bookcase 0.079480 False 104 35 6997190 134700
2050 666104133288665088 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN https://twitter.com/dog_rates/status/666104133... 1 10 None None ... cock 0.033919 False partridge 0.000052 False 14346 6634 6997190 134700
2051 666102155909144576 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN https://twitter.com/dog_rates/status/666102155... 11 10 None None ... Newfoundland 0.149842 True borzoi 0.133649 True 80 13 6997190 134700
2052 666099513787052032 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN https://twitter.com/dog_rates/status/666099513... 8 10 None None ... Shih-Tzu 0.166192 True Dandie_Dinmont 0.089688 True 156 68 6997190 134700
2053 666094000022159362 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN https://twitter.com/dog_rates/status/666094000... 9 10 None None ... German_shepherd 0.078260 True malinois 0.075628 True 164 74 6997190 134700
2054 666082916733198337 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN https://twitter.com/dog_rates/status/666082916... 6 10 None None ... bull_mastiff 0.404722 True French_bulldog 0.048960 True 119 45 6997190 134700
2055 666073100786774016 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN https://twitter.com/dog_rates/status/666073100... 10 10 None None ... English_foxhound 0.175382 True Ibizan_hound 0.097471 True 322 164 6997190 134700
2056 666071193221509120 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN https://twitter.com/dog_rates/status/666071193... 9 10 None None ... Yorkshire_terrier 0.174201 True Pekinese 0.109454 True 148 62 6997190 134700
2057 666063827256086533 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN https://twitter.com/dog_rates/status/666063827... 10 10 None None ... Tibetan_mastiff 0.093718 True Labrador_retriever 0.072427 True 476 219 6997190 134700
2058 666058600524156928 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN https://twitter.com/dog_rates/status/666058600... 8 10 None None ... komondor 0.192305 True soft-coated_wheaten_terrier 0.082086 True 112 57 6997190 134700
2059 666057090499244032 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN https://twitter.com/dog_rates/status/666057090... 9 10 None None ... shopping_basket 0.014594 False golden_retriever 0.007959 True 298 142 6997190 134700
2060 666055525042405380 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN https://twitter.com/dog_rates/status/666055525... 10 10 None None ... Tibetan_mastiff 0.058279 True fur_coat 0.054449 False 434 252 6997190 134700
2061 666051853826850816 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN https://twitter.com/dog_rates/status/666051853... 2 10 None None ... mud_turtle 0.045885 False terrapin 0.017885 False 1223 853 6997190 134700
2062 666050758794694657 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN https://twitter.com/dog_rates/status/666050758... 10 10 None None ... English_springer 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True 132 58 6997190 134700
2063 666049248165822465 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None ... Rottweiler 0.243682 True Doberman 0.154629 True 109 41 6997190 134700
2064 666044226329800704 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN https://twitter.com/dog_rates/status/666044226... 6 10 None None ... redbone 0.360687 True miniature_pinscher 0.222752 True 299 141 6997190 134700
2065 666033412701032449 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN https://twitter.com/dog_rates/status/666033412... 9 10 None None ... malinois 0.138584 True bloodhound 0.116197 True 125 44 6997190 134700
2066 666029285002620928 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN https://twitter.com/dog_rates/status/666029285... 7 10 None None ... miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True 129 47 6997190 134700
2067 666020888022790149 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None ... collie 0.156665 True Shetland_sheepdog 0.061428 True 2560 517 6997190 134700

1993 rows × 28 columns

Define

data quality issue #7

Change the data type of retweeted_status_user_id from float to object (string). Keep tweet_id as int so that it can be used as a plot axis in the visualizations.

In [94]:
# Code: cast the column from float64 to string (object dtype).
# NOTE(review): the info() output before this cell reports 0 non-null values
# for this column, so astype(str) turns every missing value into the literal
# string 'nan' (hence "1993 non-null object" in the test below). The cast also
# does not survive the CSV round-trip: the reloaded frame shows the column as
# all-null float64 again. Consider dropping the column instead - TODO confirm.
df_master['retweeted_status_user_id'] = df_master['retweeted_status_user_id'].astype(str)
In [95]:
# Test: retweeted_status_user_id should now be listed with dtype object.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 28 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    1993 non-null object
expanded_urls               1993 non-null object
rating_numerator            1993 non-null int64
rating_denominator          1993 non-null int64
name                        1993 non-null object
doggo                       1993 non-null object
floofer                     1993 non-null object
pupper                      1993 non-null object
puppo                       1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
p1                          1993 non-null object
p1_conf                     1993 non-null float64
p1_dog                      1993 non-null bool
p2                          1993 non-null object
p2_conf                     1993 non-null float64
p2_dog                      1993 non-null bool
p3                          1993 non-null object
p3_conf                     1993 non-null float64
p3_dog                      1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dtypes: bool(3), float64(3), int64(8), object(14)
memory usage: 410.7+ KB

define

quality issue #8

Rename columns to give them sensible names.

In [96]:
# Code: build the old -> new name mapping for the three p1/p2/p3 column groups
# in a loop, then apply it with a single rename.
prediction_renames = {}
for rank in ('1', '2', '3'):
    prediction_renames['p' + rank] = 'category' + rank
    prediction_renames['p' + rank + '_conf'] = 'cat' + rank + '_conf'
    prediction_renames['p' + rank + '_dog'] = 'cat' + rank + '_dog'
df_master = df_master.rename(columns=prediction_renames)
In [97]:
# Test: the p1/p2/p3 columns should now appear under their category*/cat* names.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 28 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    1993 non-null object
expanded_urls               1993 non-null object
rating_numerator            1993 non-null int64
rating_denominator          1993 non-null int64
name                        1993 non-null object
doggo                       1993 non-null object
floofer                     1993 non-null object
pupper                      1993 non-null object
puppo                       1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
category1                   1993 non-null object
cat1_conf                   1993 non-null float64
cat1_dog                    1993 non-null bool
category2                   1993 non-null object
cat2_conf                   1993 non-null float64
cat2_dog                    1993 non-null bool
category3                   1993 non-null object
cat3_conf                   1993 non-null float64
cat3_dog                    1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dtypes: bool(3), float64(3), int64(8), object(14)
memory usage: 410.7+ KB

define

quality issue#9

As the denominator is always 10, there is no point in keeping this column. We can drop it and instead rename the numerator column to rating_out_of_10.

In [98]:
# Code: drop the denominator and rename the numerator in one chained step.
# `columns=` is used because pandas 2.0 removed the positional axis argument
# (the old `drop(label, 1)` form).
# NOTE(review): the earlier fix only raised denominators below 10 up to 10; if
# any denominator above 10 remains, its numerator is not really "out of 10" -
# confirm before relying on the new column name.
df_master = (
    df_master
    .drop(columns='rating_denominator')
    .rename(columns={'rating_numerator': 'rating_out_of_10'})
)
In [99]:
# Test: rating_denominator should be gone and rating_numerator should now be
# listed as rating_out_of_10.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 27 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    1993 non-null object
expanded_urls               1993 non-null object
rating_out_of_10            1993 non-null int64
name                        1993 non-null object
doggo                       1993 non-null object
floofer                     1993 non-null object
pupper                      1993 non-null object
puppo                       1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
category1                   1993 non-null object
cat1_conf                   1993 non-null float64
cat1_dog                    1993 non-null bool
category2                   1993 non-null object
cat2_conf                   1993 non-null float64
cat2_dog                    1993 non-null bool
category3                   1993 non-null object
cat3_conf                   1993 non-null float64
cat3_dog                    1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dtypes: bool(3), float64(3), int64(7), object(14)
memory usage: 395.1+ KB

Define

Tidiness issue #2

doggo, floofer, pupper, and puppo are nothing but dog stages. Merge these columns into a new column, dog_stage.

In [100]:
# Code
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']

# Replace the 'None' placeholder with a single space in every stage column
# (one loop instead of four copy-pasted stanzas) so that the word 'None' does
# not end up inside the merged label.
for stage in stage_columns:
    df_master.loc[df_master[stage] == 'None', stage] = ' '

# Merge the four columns into dog_stage by plain string concatenation.
# NOTE(review): each missing stage contributes one space, so the labels carry
# whitespace padding (a tweet with no stage becomes four spaces) - consider
# stripping it before analysis.
df_master['dog_stage'] = (df_master['doggo'] + df_master['floofer']
                          + df_master['pupper'] + df_master['puppo'])

# Delete the now-redundant stage columns. `columns=` is used because pandas 2.0
# removed the positional axis argument (the old `drop(label, 1)` form).
df_master = df_master.drop(columns=stage_columns)
In [101]:
# Test
# Check the dataframe info: dog_stage should be present and the doggo,
# floofer, pupper and puppo columns should be gone.
df_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2067
Data columns (total 24 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    1993 non-null object
expanded_urls               1993 non-null object
rating_out_of_10            1993 non-null int64
name                        1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
category1                   1993 non-null object
cat1_conf                   1993 non-null float64
cat1_dog                    1993 non-null bool
category2                   1993 non-null object
cat2_conf                   1993 non-null float64
cat2_dog                    1993 non-null bool
category3                   1993 non-null object
cat3_conf                   1993 non-null float64
cat3_dog                    1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dog_stage                   1993 non-null object
dtypes: bool(3), float64(3), int64(7), object(11)
memory usage: 348.4+ KB
In [102]:
# Test: list the final column names (iterating a DataFrame yields its column labels).
list(df_master)
Out[102]:
['tweet_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_user_id',
 'expanded_urls',
 'rating_out_of_10',
 'name',
 'jpg_url',
 'img_num',
 'category1',
 'cat1_conf',
 'cat1_dog',
 'category2',
 'cat2_conf',
 'cat2_dog',
 'category3',
 'cat3_conf',
 'cat3_dog',
 'favorites',
 'retweets',
 'user_followers',
 'user_favourites',
 'dog_stage']
In [103]:
# Persist the cleaned frame as a UTF-8 CSV file, leaving out the index column.
df_master.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)
In [104]:
# Reload the file that was just written to check the round trip. read_csv
# infers the dtypes again, so they can differ from the in-memory frame (see
# retweeted_status_user_id in the output below).
df_master = pd.read_csv('twitter_archive_master.csv')
df_master.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1993 entries, 0 to 1992
Data columns (total 24 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    0 non-null float64
expanded_urls               1993 non-null object
rating_out_of_10            1993 non-null int64
name                        1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
category1                   1993 non-null object
cat1_conf                   1993 non-null float64
cat1_dog                    1993 non-null bool
category2                   1993 non-null object
cat2_conf                   1993 non-null float64
cat2_dog                    1993 non-null bool
category3                   1993 non-null object
cat3_conf                   1993 non-null float64
cat3_dog                    1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dog_stage                   1993 non-null object
dtypes: bool(3), float64(4), int64(7), object(10)
memory usage: 332.9+ KB

Data cleaning section ends here.....

Visualization section begins here....

In [105]:
# Import the clean dataset into a dataframe, print its column summary, and
# display the frame itself.
# NOTE(review): the last line renders the whole frame; df_master.head() would
# keep the output short.
df_master = pd.read_csv('twitter_archive_master.csv')
df_master.info()
df_master
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1993 entries, 0 to 1992
Data columns (total 24 columns):
tweet_id                    1993 non-null int64
timestamp                   1993 non-null object
source                      1993 non-null object
text                        1993 non-null object
retweeted_status_user_id    0 non-null float64
expanded_urls               1993 non-null object
rating_out_of_10            1993 non-null int64
name                        1993 non-null object
jpg_url                     1993 non-null object
img_num                     1993 non-null int64
category1                   1993 non-null object
cat1_conf                   1993 non-null float64
cat1_dog                    1993 non-null bool
category2                   1993 non-null object
cat2_conf                   1993 non-null float64
cat2_dog                    1993 non-null bool
category3                   1993 non-null object
cat3_conf                   1993 non-null float64
cat3_dog                    1993 non-null bool
favorites                   1993 non-null int64
retweets                    1993 non-null int64
user_followers              1993 non-null int64
user_favourites             1993 non-null int64
dog_stage                   1993 non-null object
dtypes: bool(3), float64(4), int64(7), object(10)
memory usage: 332.9+ KB
Out[105]:
tweet_id timestamp source text retweeted_status_user_id expanded_urls rating_out_of_10 name jpg_url img_num ... cat2_conf cat2_dog category3 cat3_conf cat3_dog favorites retweets user_followers user_favourites dog_stage
0 892420643555336193 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN https://twitter.com/dog_rates/status/892420643... 13 Phineas https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 ... 0.085851 False banana 0.076110 False 38680 8559 6997101 134700
1 892177421306343426 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN https://twitter.com/dog_rates/status/892177421... 13 Tilly https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 ... 0.090647 True papillon 0.068957 True 33156 6291 6997101 134700
2 891815181378084864 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN https://twitter.com/dog_rates/status/891815181... 12 Archie https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 ... 0.078253 True kelpie 0.031379 True 24958 4170 6997101 134700
3 891689557279858688 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN https://twitter.com/dog_rates/status/891689557... 13 Darla https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 ... 0.168086 True spatula 0.040836 False 42064 8687 6997101 134700
4 891327558926688256 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN https://twitter.com/dog_rates/status/891327558... 12 Franklin https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 ... 0.225770 True German_short-haired_pointer 0.175219 True 40212 9447 6997101 134700
5 891087950875897856 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN https://twitter.com/dog_rates/status/891087950... 13 None https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg 1 ... 0.116317 True Indian_elephant 0.076902 False 20163 3127 6997101 134700
6 890971913173991426 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... 13 Jax https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg 1 ... 0.199287 True ice_lolly 0.193548 False 11816 2082 6997101 134700
7 890729181411237888 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN https://twitter.com/dog_rates/status/890729181... 13 None https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg 2 ... 0.178406 True Pembroke 0.076507 True 65338 18971 6997101 134700
8 890609185150312448 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN https://twitter.com/dog_rates/status/890609185... 13 Zoey https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg 1 ... 0.193054 True Chesapeake_Bay_retriever 0.118184 True 27713 4279 6997101 134700
9 890240255349198849 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN https://twitter.com/dog_rates/status/890240255... 14 Cassie https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg 1 ... 0.451038 True Chihuahua 0.029248 True 31857 7448 6997101 134700 doggo
10 890006608113172480 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN https://twitter.com/dog_rates/status/890006608... 13 Koda https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg 1 ... 0.013884 True chow 0.008167 True 30583 7363 6997101 134700
11 889880896479866881 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN https://twitter.com/dog_rates/status/889880896... 13 Bruno https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg 1 ... 0.151317 True muzzle 0.082981 False 27719 4989 6997101 134700
12 889665388333682689 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN https://twitter.com/dog_rates/status/889665388... 13 None https://pbs.twimg.com/media/DFi579UWsAAatzw.jpg 1 ... 0.027356 True basenji 0.004633 True 48003 10101 6997101 134700 puppo
13 889638837579907072 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN https://twitter.com/dog_rates/status/889638837... 12 Ted https://pbs.twimg.com/media/DFihzFfXsAYGDPR.jpg 1 ... 0.002129 True Staffordshire_bullterrier 0.001498 True 27103 4564 6997101 134700
14 889531135344209921 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN https://twitter.com/dog_rates/status/889531135... 13 Stuart https://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg 1 ... 0.013834 True redbone 0.007958 True 15052 2244 6997101 134700 puppo
15 889278841981685760 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN https://twitter.com/dog_rates/status/889278841... 13 Oliver https://pbs.twimg.com/ext_tw_video_thumb/88927... 1 ... 0.194742 True Saluki 0.027351 True 25236 5446 6997101 134700
16 888917238123831296 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN https://twitter.com/dog_rates/status/888917238... 12 Jim https://pbs.twimg.com/media/DFYRgsOUQAARGhO.jpg 1 ... 0.120184 True Labrador_retriever 0.105506 True 28996 4517 6997101 134700
17 888804989199671297 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN https://twitter.com/dog_rates/status/888804989... 13 Zeke https://pbs.twimg.com/media/DFWra-3VYAA2piG.jpg 1 ... 0.184172 True English_setter 0.073482 True 25527 4363 6997101 134700
18 888554962724278272 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN https://twitter.com/dog_rates/status/888554962... 13 Ralphus https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg 3 ... 0.166511 True malamute 0.111411 True 19850 3596 6997101 134700
19 888078434458587136 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN https://twitter.com/dog_rates/status/888078434... 12 Gerald https://pbs.twimg.com/media/DFMWn56WsAAkA7B.jpg 1 ... 0.000932 True bull_mastiff 0.000903 True 21702 3511 6997101 134700
20 887705289381826560 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN https://twitter.com/dog_rates/status/887705289... 13 Jeffrey https://pbs.twimg.com/media/DFHDQBbXgAEqY7t.jpg 1 ... 0.087582 True Weimaraner 0.026236 True 30108 5414 6997101 134700
21 887517139158093824 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN https://twitter.com/dog_rates/status/887517139... 14 None https://pbs.twimg.com/ext_tw_video_thumb/88751... 1 ... 0.029175 False shopping_cart 0.026321 False 46119 11713 6997101 134700
22 887473957103951883 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN https://twitter.com/dog_rates/status/887473957... 13 Canela https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2 ... 0.054950 True beagle 0.038915 True 68935 18305 6997101 134700
23 887343217045368832 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN https://twitter.com/dog_rates/status/887343217... 13 None https://pbs.twimg.com/ext_tw_video_thumb/88734... 1 ... 0.275645 False Weimaraner 0.134203 True 33582 10446 6997101 134700
24 887101392804085760 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN https://twitter.com/dog_rates/status/887101392... 12 None https://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg 1 ... 0.035029 True Staffordshire_bullterrier 0.029705 True 30451 5984 6997101 134700
25 886983233522544640 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN https://twitter.com/dog_rates/status/886983233... 13 Maya https://pbs.twimg.com/media/DE8yicJW0AAAvBJ.jpg 2 ... 0.143528 True can_opener 0.032253 False 35058 7806 6997101 134700
26 886736880519319552 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN https://www.gofundme.com/mingusneedsus,https:/... 13 Mingus https://pbs.twimg.com/media/DE5Se8FXcAAJFx4.jpg 1 ... 0.186136 True Dandie_Dinmont 0.086346 True 12037 3311 6997101 134700
27 886680336477933568 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN https://twitter.com/dog_rates/status/886680336... 13 Derek https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg 1 ... 0.139952 False car_wheel 0.044173 False 22362 4486 6997101 134700
28 886366144734445568 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN https://twitter.com/dog_rates/status/886366144... 12 Roscoe https://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg 1 ... 0.000361 True Boston_bull 0.000076 True 21142 3208 6997102 134700 pupper
29 886258384151887873 2017-07-15 16:17:19 +0000 <a href="http://twitter.com/download/iphone" r... This is Waffles. His doggles are pupside down.... NaN https://twitter.com/dog_rates/status/886258384... 13 Waffles https://pbs.twimg.com/media/DEyfTG4UMAE4aE9.jpg 1 ... 0.025286 False Siamese_cat 0.002849 False 27905 6316 6997102 134700
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1963 666411507551481857 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN https://twitter.com/dog_rates/status/666411507... 2 None https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg 1 ... 0.271485 False gar 0.189945 False 447 328 6997189 134700
1964 666407126856765440 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN https://twitter.com/dog_rates/status/666407126... 7 None https://pbs.twimg.com/media/CT-NvwmW4AAugGZ.jpg 1 ... 0.244220 True flat-coated_retriever 0.173810 True 110 41 6997189 134700
1965 666396247373291520 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN https://twitter.com/dog_rates/status/666396247... 9 None https://pbs.twimg.com/media/CT-D2ZHWIAA3gK1.jpg 1 ... 0.009397 True papillon 0.004577 True 167 86 6997189 134700
1966 666373753744588802 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN https://twitter.com/dog_rates/status/666373753... 11 None https://pbs.twimg.com/media/CT9vZEYWUAAlZ05.jpg 1 ... 0.259551 True briard 0.206803 True 190 93 6997189 134700
1967 666362758909284353 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN https://twitter.com/dog_rates/status/666362758... 6 None https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg 1 ... 0.002402 False hamster 0.000461 False 778 574 6997189 134700
1968 666353288456101888 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the GalƔpagos... NaN https://twitter.com/dog_rates/status/666353288... 8 None https://pbs.twimg.com/media/CT9cx0tUEAAhNN_.jpg 1 ... 0.147655 True Eskimo_dog 0.093412 True 221 73 6997190 134700
1969 666345417576210432 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN https://twitter.com/dog_rates/status/666345417... 10 None https://pbs.twimg.com/media/CT9Vn7PWoAA_ZCM.jpg 1 ... 0.054787 True Labrador_retriever 0.014241 True 299 139 6997190 134700
1970 666337882303524864 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN https://twitter.com/dog_rates/status/666337882... 9 None https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg 1 ... 0.278407 True groenendael 0.102643 True 198 92 6997190 134700
1971 666293911632134144 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN https://twitter.com/dog_rates/status/666293911... 3 None https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg 1 ... 0.015250 False great_grey_owl 0.013207 False 509 357 6997190 134700
1972 666287406224695296 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN https://twitter.com/dog_rates/status/666287406... 1 None https://pbs.twimg.com/media/CT8g3BpUEAAuFjg.jpg 1 ... 0.063064 True miniature_poodle 0.025581 True 149 66 6997190 134700
1973 666273097616637952 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN https://twitter.com/dog_rates/status/666273097... 11 None https://pbs.twimg.com/media/CT8T1mtUwAA3aqm.jpg 1 ... 0.111884 True basenji 0.111152 True 176 76 6997190 134700
1974 666268910803644416 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN https://twitter.com/dog_rates/status/666268910... 10 None https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg 1 ... 0.085547 False bookcase 0.079480 False 104 35 6997190 134700
1975 666104133288665088 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN https://twitter.com/dog_rates/status/666104133... 1 None https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg 1 ... 0.033919 False partridge 0.000052 False 14346 6634 6997190 134700
1976 666102155909144576 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN https://twitter.com/dog_rates/status/666102155... 11 None https://pbs.twimg.com/media/CT54YGiWUAEZnoK.jpg 1 ... 0.149842 True borzoi 0.133649 True 80 13 6997190 134700
1977 666099513787052032 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN https://twitter.com/dog_rates/status/666099513... 8 None https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg 1 ... 0.166192 True Dandie_Dinmont 0.089688 True 156 68 6997190 134700
1978 666094000022159362 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN https://twitter.com/dog_rates/status/666094000... 9 None https://pbs.twimg.com/media/CT5w9gUW4AAsBNN.jpg 1 ... 0.078260 True malinois 0.075628 True 164 74 6997190 134700
1979 666082916733198337 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN https://twitter.com/dog_rates/status/666082916... 6 None https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg 1 ... 0.404722 True French_bulldog 0.048960 True 119 45 6997190 134700
1980 666073100786774016 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN https://twitter.com/dog_rates/status/666073100... 10 None https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg 1 ... 0.175382 True Ibizan_hound 0.097471 True 322 164 6997190 134700
1981 666071193221509120 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN https://twitter.com/dog_rates/status/666071193... 9 None https://pbs.twimg.com/media/CT5cN_3WEAAlOoZ.jpg 1 ... 0.174201 True Pekinese 0.109454 True 148 62 6997190 134700
1982 666063827256086533 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN https://twitter.com/dog_rates/status/666063827... 10 None https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg 1 ... 0.093718 True Labrador_retriever 0.072427 True 476 219 6997190 134700
1983 666058600524156928 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN https://twitter.com/dog_rates/status/666058600... 8 None https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg 1 ... 0.192305 True soft-coated_wheaten_terrier 0.082086 True 112 57 6997190 134700
1984 666057090499244032 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN https://twitter.com/dog_rates/status/666057090... 9 None https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg 1 ... 0.014594 False golden_retriever 0.007959 True 298 142 6997190 134700
1985 666055525042405380 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN https://twitter.com/dog_rates/status/666055525... 10 None https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg 1 ... 0.058279 True fur_coat 0.054449 False 434 252 6997190 134700
1986 666051853826850816 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN https://twitter.com/dog_rates/status/666051853... 2 None https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg 1 ... 0.045885 False terrapin 0.017885 False 1223 853 6997190 134700
1987 666050758794694657 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN https://twitter.com/dog_rates/status/666050758... 10 None https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg 1 ... 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True 132 58 6997190 134700
1988 666049248165822465 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN https://twitter.com/dog_rates/status/666049248... 5 None https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 ... 0.243682 True Doberman 0.154629 True 109 41 6997190 134700
1989 666044226329800704 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN https://twitter.com/dog_rates/status/666044226... 6 None https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 ... 0.360687 True miniature_pinscher 0.222752 True 299 141 6997190 134700
1990 666033412701032449 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN https://twitter.com/dog_rates/status/666033412... 9 None https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 ... 0.138584 True bloodhound 0.116197 True 125 44 6997190 134700
1991 666029285002620928 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN https://twitter.com/dog_rates/status/666029285... 7 None https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 ... 0.074192 True Rhodesian_ridgeback 0.072010 True 129 47 6997190 134700
1992 666020888022790149 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN https://twitter.com/dog_rates/status/666020888... 8 None https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 ... 0.156665 True Shetland_sheepdog 0.061428 True 2560 517 6997190 134700

1993 rows × 24 columns

In [106]:
# Scatter plot of retweet count against tweet_id, drawn through the explicit
# Axes object returned by pandas rather than the pyplot state machine.
ax = df_master.plot.scatter(x='tweet_id', y='retweets', alpha=0.5, color='red')
ax.set_xlabel('tweet_id')
ax.set_ylabel('retweets')
ax.set_title('Tweet Id vs Retweet Scatter plot')
Out[106]:
Text(0.5,1,'Tweet Id vs Retweet Scatter plot')
In [107]:
# Scatter plot of the user_favourites column against tweet_id, drawn through
# the explicit Axes object returned by pandas.
ax = df_master.plot.scatter(x='tweet_id', y='user_favourites', alpha=1, color='red')
ax.set_xlabel('tweet_id')
ax.set_ylabel('user_favourites')
ax.set_title('Tweet and user favorites Scatter plot')
Out[107]:
Text(0.5,1,'Tweet and user favorites Scatter plot')
In [108]:
# Bar chart of the 5 most-favorited tweets.
# Sort by favorite count, highest first, then keep the first five rows.
# .head() is the public pandas API for this; the private DataFrame._slice
# helper is an internal detail that may change or disappear between releases.
test = df_master.sort_values(['favorites'], ascending=False)
test1 = test.head(5)

# The title is passed through plot() so the cell's last expression (and hence
# its displayed output) is still the Axes object.
test1.plot(x='tweet_id', y='favorites', kind='bar', title='Top 5 most favorited tweets')
Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4fb912def0>
In [109]:
# Bar chart of the 5 most-retweeted tweets.
# Sort by retweet count, highest first, then keep the first five rows.
# .head() is the public pandas API for this; the private DataFrame._slice
# helper is an internal detail that may change or disappear between releases.
test = df_master.sort_values(['retweets'], ascending=False)
test1 = test.head(5)

# The title is passed through plot() so the cell's last expression (and hence
# its displayed output) is still the Axes object.
test1.plot(x='tweet_id', y='retweets', kind='bar', title='Top 5 most retweeted tweets')
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4fb9137b38>
In [110]:
# Show the five most frequent labels in the category2 column with their counts
# (value_counts sorts by descending frequency, so the first five are the top five).
df_master['category2'].value_counts().iloc[:5]
Out[110]:
Labrador_retriever          96
golden_retriever            82
Cardigan                    72
Chihuahua                   43
Chesapeake_Bay_retriever    40
Name: category2, dtype: int64
In [111]:
# Keep the five most frequent breeds in category2 and relabel every other
# breed as 'Others', ready for a pie chart of the breed breakdown.
# The top five are derived from the data instead of being typed in by hand,
# so the list cannot drift out of sync with the value counts (a hand-written
# list naming 'Pomeranian' would wrongly fold the actual fifth breed,
# 'Chesapeake_Bay_retriever', into 'Others').
df1 = df_master.copy()  # work on a copy so df_master keeps the original labels
top_breeds = df1['category2'].value_counts().head(5).index
df1.loc[~df1['category2'].isin(top_breeds), 'category2'] = 'Others'
In [112]:
# Pie chart of how the category2 labels in df1 are distributed, with each
# wedge annotated by its percentage share. Missing labels are excluded.
df1['category2'].dropna().value_counts().plot.pie(autopct='%1.1f%%')
plt.title('Dog breed distribution')
Out[112]:
Text(0.5,1,'Dog breed distribution')
In [113]:
# Demonstration of seaborn: scatter plot of retweets against tweet_id.
# x and y are passed by keyword because newer seaborn releases no longer
# accept them positionally; fit_reg=False suppresses the regression line so
# only the points are drawn.
sns.lmplot(x='tweet_id', y='retweets', data=df_master, fit_reg=False)
Out[113]:
<seaborn.axisgrid.FacetGrid at 0x7f4fbb0eb668>

Visualization section ends here.

Comments

Popular posts from this blog

The journey of a thousand miles..

"The journey of a thousand miles begins with one step." This famous quote by Lao Tzu describes my current state of mind very well. The journey I am embarking on is into the field of data science. This blog, whose sole purpose is to share my growth from infancy to maturity, will be a record of that growth, my ups and downs, and all the relevant experiences. Wish me good luck!

Pandas cheat sheet

In my learning so far , I have observed that NumPy, Pandas and matplotlib are core parts of Python which are going to help in data analysis. Here are some of the Pandas code snippets which I tested myself in Jupyter notebook. ==>to figure out number of duplicate rows df1['is_duplicated'] = df1.duplicated(['col1', 'Col2'.....'coln']) print(df1['is_duplicated'].sum()) ==> to figure out rows with missing values .. sum(df1.apply(lambda x: sum(x.isnull().values), axis = 1)>0) ==> to figure out unique values for a column        np.unique(df1['column name']) ==>to drop a column df.drop(['col_name', axis=1, inplace=True) ==> to replace a column df_08.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True) ==> to rename a column df = df.rename(columns={'old name': 'new name'}) ==> replace spaces with underscore df.rename(columns=lamb