Twitter Streaming
Stream Listener of Twitter
This is a quick and easy example of how to download Twitter data.
Imports
import tweepy
from tweepy import Stream
import pandas as pd
import json
from datetime import datetime
Functions
def get_tweet_type(tweet):
    """
    Classify a tweet and find the id of the tweet it refers to.

    Parameters
    ----------
    tweet : dict
        Decoded tweet payload. The keys read are 'retweeted_status',
        'quoted_status' and 'in_reply_to_status_id'.

    Returns
    -------
    tuple
        ``(type_tw, source_tweet)`` where ``type_tw`` is one of
        "TW" (plain tweet), "RT" (retweet), "QT" (quote) or "RP" (reply),
        and ``source_tweet`` is the id of the referenced tweet, or False
        when the tweet does not refer to another one.
    """
    source_tweet = False
    type_tw = "TW"

    # A quote is checked after a retweet, so when both keys are present the
    # tweet is reported as "QT".
    for key, label in (("retweeted_status", "RT"), ("quoted_status", "QT")):
        try:
            referenced_id = tweet[key]['id']
        except (KeyError, TypeError):
            # Key absent, or present but null / without an 'id'.
            continue
        source_tweet = referenced_id
        type_tw = label

    try:
        reply_to = tweet['in_reply_to_status_id']
    except (KeyError, TypeError):
        reply_to = None

    # Only a tweet that is neither a retweet nor a quote is labelled a reply.
    # (The previous code compared the builtin `type` to "TW", which is never
    # true, so replies were never detected.)
    if reply_to is not None and type_tw == "TW":
        type_tw = "RP"
        source_tweet = reply_to

    return type_tw, source_tweet
Class
Twitter Stream Listener
class StdOutListener(Stream):
    """
    Listener that prints every received message to stdout and appends the
    parsed tweet to a text file.

    Each tweet becomes one line whose fields are separated by ``~|~``;
    multi-valued fields (user mentions, hashtags) are joined with ``~``.
    Missing (null) values are written as the string 'nan'.
    """

    # NOTE(review): the base-class initializer is never called from
    # __init__ below; confirm against the installed tweepy version that
    # this is intended.

    # Timestamp layout of the incoming payload and the layout written out.
    _DATE_IN = '%a %b %d %H:%M:%S +0000 %Y'
    _DATE_OUT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, fetched_tweets_filename):
        """Store the path of the file that tweets are appended to."""
        self.fetched_tweets_filename = fetched_tweets_filename

    @classmethod
    def _reformat_date(cls, raw):
        """Convert a payload timestamp to 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.strftime(datetime.strptime(raw, cls._DATE_IN), cls._DATE_OUT)

    @staticmethod
    def _join_entities(data, kind, field):
        """Join ``field`` of every ``data['entities'][kind]`` item with '~'.

        Returns 'nan' when the entities cannot be read.
        """
        try:
            return '~'.join([item[field] for item in data['entities'][kind]])
        except (KeyError, TypeError):
            return 'nan'

    def on_data(self, data):
        """
        Print one raw JSON message, parse it and append it to the output file.

        Parameters
        ----------
        data : str
            Raw JSON text of the message.

        Returns
        -------
        bool
            Always True. A message that cannot be processed is reported on
            stdout and skipped.
        """
        try:
            print(data)
            data = json.loads(data)

            if data["in_reply_to_screen_name"] is None:
                in_reply_to_screen_name = 'nan'
            else:
                in_reply_to_screen_name = str(data["in_reply_to_screen_name"])

            user_mentions = self._join_entities(data, 'user_mentions', 'screen_name')
            hash_mentions = self._join_entities(data, 'hashtags', 'text')

            # Newlines are removed so that one tweet stays on one line.
            text = (data["text"].strip()).replace('\n', '')
            created = self._reformat_date(data["created_at"])
            # Only the type is written out; the source tweet id is unused.
            type_tw, source_tweet = get_tweet_type(data)
            retweets = data['retweet_count']
            favorites = data['favorite_count']
            quote_counts = data["quote_count"]
            lang = data['lang']

            user = data["user"]
            u_id = user["id_str"]
            u_screen_name = user["screen_name"]
            u_followers = user["followers_count"]
            u_followings = user["friends_count"]
            u_location = user['location']
            if user['description'] is not None:
                u_bio = user['description'].replace('\n', '')
            else:
                u_bio = 'nan'
            u_created = self._reformat_date(user['created_at'])

            new_ls = [created, text, type_tw, retweets, favorites, quote_counts, lang,
                      in_reply_to_screen_name, user_mentions, hash_mentions,
                      u_id, u_screen_name, u_followers, u_followings, u_location,
                      u_bio, u_created]
            new_ls = ['nan' if v is None else v for v in new_ls]
            str_data = '~|~'.join(map(str, new_ls))

            # Explicit UTF-8: tweets routinely contain emoji and non-Latin
            # text that a platform-default encoding may be unable to write.
            with open(self.fetched_tweets_filename, 'a', encoding='utf-8') as tf:
                tf.write(str_data + '\n')
        except Exception as e:
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit still propagate and the program can be stopped.
            print("Error on_data %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
Twitter Streamer
class TwitterStreamer:
    """
    Streams live tweets, filtered by hashtags/keywords and by a location,
    and hands them to a StdOutListener for processing.
    """

    def __init__(self):
        pass

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        """
        Authenticate against Twitter and start the filtered stream.

        fetched_tweets_filename: path given to the StdOutListener.
        hash_tag_list: keywords/hashtags passed as the ``track`` filter.
        """
        # Twitter authentication (consumer credentials, then access token).
        credentials = tweepy.OAuthHandler("*******", "*******")
        credentials.set_access_token("*******", "*******")

        # Connection to the Twitter Streaming API through the listener.
        tweet_writer = StdOutListener(fetched_tweets_filename)
        live_stream = Stream(credentials, tweet_writer)

        # Capture data matching the keywords/hashtags and the location box.
        bounding_box = [00, 00, 00, 00]
        live_stream.filter(track=hash_tag_list, locations=bounding_box)
The following section runs when the script is executed from the terminal. Before running it, you can adjust parameters such as:
- File name
- Column names
- Hashtags
- Location
- Twitter API credentials (keys and tokens)
if __name__ == '__main__':
    # Keywords/hashtags to track and the file that receives the tweets.
    hash_tag_list = ["********", "********", "********"]
    fetched_tweets_filename = "********.txt"

    # Column names, in the same order as the fields written per tweet.
    ls_columns_name = [
        "created_at",
        "text",
        "type_tw",
        "retweets",
        "favorites",
        "quotes_count",
        'lang',
        "in_reply_to_screen_name",
        'users_mentions',
        'hashtags_mentions',
        "user_id",
        "user_screen_name",
        "user_followers",
        "user_followings",
        "user_location",
        "user_bio",
        "user_created_at",
    ]
    str_colnames = '~|~'.join(ls_columns_name)

    # (Re)create the output file so that it starts with the header line.
    with open(fetched_tweets_filename, 'w') as tf:
        tf.write(str_colnames + '\n')

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)