Twitter Streaming

Stream Listener of Twitter

This is a quick and easy example of how to download Twitter data.

Imports

import tweepy
from tweepy import Stream
import pandas as pd
import json
from datetime import datetime

Functions

def get_tweet_type(tweet):
    """Classify a tweet payload and return the id of the tweet it refers to.

    Args:
        tweet: Decoded tweet payload (a mapping). Keys inspected are
            ``retweeted_status``, ``quoted_status`` and
            ``in_reply_to_status_id``.

    Returns:
        A ``(type_tw, source_tweet)`` tuple where ``type_tw`` is one of
        ``"TW"`` (plain tweet), ``"RT"`` (retweet), ``"QT"`` (quote) or
        ``"RP"`` (reply), and ``source_tweet`` is the id of the referenced
        tweet, or ``False`` when there is none.
    """
    type_tw = "TW"
    source_tweet = False

    # Checked in this order on purpose: when both keys are present the quote
    # label wins over the retweet label, as in the original implementation.
    for key, label in (("retweeted_status", "RT"), ("quoted_status", "QT")):
        try:
            referenced_id = tweet[key]["id"]
        except (KeyError, TypeError):
            # Key missing, or its value is not a mapping (e.g. None).
            continue
        type_tw = label
        source_tweet = referenced_id

    try:
        reply_to = tweet["in_reply_to_status_id"]
    except (KeyError, TypeError):
        reply_to = None

    # Only a tweet that is neither a retweet nor a quote is labelled a reply.
    # (The original compared the builtin ``type`` to "TW", which is never
    # true, so replies were never detected.)
    if reply_to is not None and type_tw == "TW":
        type_tw = "RP"
        source_tweet = reply_to

    return type_tw, source_tweet

Class

  • Twitter Stream Listener

class StdOutListener(Stream):
    """
    Listener that prints each received message to stdout and appends a
    flattened, '~|~'-delimited row for it to a text file.
    """
    def __init__(self, fetched_tweets_filename):
        """Remember the path of the file that rows are appended to.

        Args:
            fetched_tweets_filename: Path of the output text file.
        """
        # NOTE(review): the base-class constructor is not called here; confirm
        # against the installed tweepy version whether that is required.
        self.fetched_tweets_filename = fetched_tweets_filename

    @staticmethod
    def _reformat_timestamp(raw):
        """Convert 'Wed Oct 10 20:19:24 +0000 2018' to '2018-10-10 20:19:24'."""
        parsed = datetime.strptime(raw, '%a %b %d %H:%M:%S +0000 %Y')
        return parsed.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def _join_entity_field(data, entity, field):
        """Join ``field`` of every item in data['entities'][entity] with '~'.

        Returns 'nan' when the entity list is missing or malformed.
        """
        try:
            return '~'.join([item[field] for item in data['entities'][entity]])
        except (KeyError, TypeError):
            return 'nan'

    def on_data(self, data):
        """Print the raw message, flatten it and append it to the output file.

        Args:
            data: Raw JSON message received from the stream.

        Returns:
            Always ``True``. A message that cannot be processed is reported
            on stdout and skipped.
        """
        try:
            print(data)
            data = json.loads(data)

            reply_name = data["in_reply_to_screen_name"]
            in_reply_to_screen_name = 'nan' if reply_name is None else str(reply_name)

            user_mentions = self._join_entity_field(data, 'user_mentions', 'screen_name')
            hash_mentions = self._join_entity_field(data, 'hashtags', 'text')

            # Newlines are removed so that each tweet occupies exactly one row.
            text = data["text"].strip().replace('\n', '')
            created = self._reformat_timestamp(data["created_at"])
            type_tw, source_tweet = get_tweet_type(data)
            retweets = data['retweet_count']
            favorites = data['favorite_count']
            quote_counts = data["quote_count"]
            lang = data['lang']

            user = data["user"]
            u_id = user["id_str"]
            u_screen_name = user["screen_name"]
            u_followers = user["followers_count"]
            u_followings = user["friends_count"]
            u_location = user['location']
            description = user['description']
            u_bio = 'nan' if description is None else description.replace('\n', '')
            u_created = self._reformat_timestamp(user['created_at'])

            new_ls = [created, text, type_tw, retweets, favorites, quote_counts, lang,
                      in_reply_to_screen_name, user_mentions, hash_mentions,
                      u_id, u_screen_name, u_followers, u_followings, u_location,
                      u_bio, u_created]
            # Missing values are written as the literal string 'nan'.
            new_ls = ['nan' if v is None else v for v in new_ls]
            str_data = '~|~'.join(map(str, new_ls))

            # Explicit UTF-8 so non-ASCII tweet text does not depend on the
            # platform's default encoding.
            with open(self.fetched_tweets_filename, 'a', encoding='utf-8') as tf:
                tf.write(str_data + '\n')
        except Exception as e:
            # Exception rather than BaseException, so that KeyboardInterrupt
            # and SystemExit can still stop the stream.
            print("Error on_data %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
  • Twitter Streamer

class TwitterStreamer():
    """
    Class for streaming and processing live tweets, filtered by keywords/hashtags
    and by a location bounding box.
    """
    def __init__(self):
        pass

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list, locations=None):
        """Authenticate, open the stream and start filtering tweets.

        Args:
            fetched_tweets_filename: Path of the file the listener appends to.
            hash_tag_list: Keywords/hashtags passed to the filter as ``track``.
            locations: Optional bounding-box coordinates passed to the filter
                as ``locations``. Defaults to ``[0, 0, 0, 0]``, the value that
                was previously hard-coded.
        """
        if locations is None:
            locations = [0, 0, 0, 0]

        # This handles Twitter authentication and the connection to the Twitter Streaming API
        listener = StdOutListener(fetched_tweets_filename)
        auth = tweepy.OAuthHandler("*******", "*******")
        auth.set_access_token("*******", "*******")
        stream = Stream(auth, listener)

        # This line attempts to filter Twitter streams and capture data by the keywords/hashtags:
        stream.filter(track=hash_tag_list, locations=locations)

The following section runs when the script is executed from the terminal. You can change parameters such as:

  • File name
  • Column names
  • Hashtags
  • Location
  • Twitter API credentials (keys and tokens)
if __name__ == '__main__':
    # Output file and the keywords/hashtags to track.
    fetched_tweets_filename = "********.txt"
    hash_tag_list = ["********", "********", "********"]

    # Column names, in the same order as the rows written by the listener.
    column_names = [
        "created_at", "text", "type_tw", "retweets", "favorites", "quotes_count", "lang",
        "in_reply_to_screen_name", "users_mentions", "hashtags_mentions",
        "user_id", "user_screen_name", "user_followers", "user_followings", "user_location",
        "user_bio", "user_created_at",
    ]
    header_line = '~|~'.join(column_names) + '\n'

    # Start from a fresh file that contains only the header row.
    with open(fetched_tweets_filename, 'w') as out_file:
        out_file.write(header_line)

    streamer = TwitterStreamer()
    streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)
Alex Amaguaya
Alex Amaguaya
Research Assistant / Data Scientist

My research interests are Networks and the intersection between Econometrics and Machine Learning.