#!/usr/bin/env python
# coding: utf-8

# # RickyRenuncia Project
# 
# The team of the RickyRenuncia Project managed multiple acquisition procedures to preserve records of the incidents that occurred during the summer of 2019 related to the resignation of ex-governor Ricardo Rosselló Nevares.
# 
# ## Physical Media vs Digital Donations
# 
# The team collected artifacts and banners used during the demonstrations. Whenever possible, the artifacts were accompanied by audio interviews and/or photographs of the demonstrators who produced and used them. Through social media and online word of mouth, the team also contacted the community to request imagery and content related to the activities of that summer.
# 
# ## Twitter Data Collection
# 
# To gain a broad view of the many activities and demonstrations around the globe, one of the team members, Joel Blanco, decided to capture records of tweet activity on the web. This data was captured live during the days of the incident and requires processing and analysis to provide a valid interpretation of the information acquired.
# 
# A cleaned version of this dataset occupies over `7 gigabytes` but fits into `777 megabytes` when compressed using `gzip`. Full-text data can generally be compressed very effectively. Below we calculate the benefit of compressing this specific dataset.

# # Notebook Requirements
# 
# Before using this notebook, the user will need to initialize the environment variables as specified in [Developer_Registration.ipynb](./Developer_Registration.ipynb).

# In[ ]:

# Calculate the storage benefits of compression

# Observations
original_size_G = 7
final_size_M = 777

# Unit transformation
giga_to_mega_rate = 1024.0
original_size_M = original_size_G * giga_to_mega_rate

# Calculate percent change
new_size_to_old_size = final_size_M / original_size_M
new_size_percent = new_size_to_old_size * 100.0
space_freed_percent = 100 - new_size_percent

print(
    "The storage was reduced to {:.1f}%.\n"
    "After compression, {:.1f}% of the originally occupied space was freed."
    .format(new_size_percent, space_freed_percent)
)

# The benefits can be substantial, especially for long-term storage.
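# The claimed ratio can also be spot-checked with Python's built-in `gzip` module. Below is a minimal sketch; it assumes the cleaned JSONL sits at the same path used later in this notebook and writes a compressed copy next to it.

# In[ ]:

# Sketch: measure the actual gzip ratio on disk.
# The path is an assumption; adjust it to wherever the cleaned JSONL lives.
import gzip
import os
import shutil

jsonl_path = "/home/rickyrenuncia/tweetsRickyRenuncia-final.jsonl"
gz_path = jsonl_path + ".gz"

# Stream the file through gzip at the highest compression level.
with open(jsonl_path, "rb") as src, gzip.open(gz_path, "wb", compresslevel=9) as dst:
    shutil.copyfileobj(src, dst)

original_bytes = os.path.getsize(jsonl_path)
compressed_bytes = os.path.getsize(gz_path)
print(f"Compressed copy is {100.0 * compressed_bytes / original_bytes:.1f}% of the original.")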
# # Twitter Data: What did we collect?
# 
# It is important to understand the type of data that is collected from a social media API (application programming interface). The file `Data/Joel/tweetsRickyRenuncia-final.jsonl` is in JSONL format. If you are familiar with JSON files, this format is a sequence of JSON strings, one per line; the "L" stands for line (`jsonl = json-l = json-line`).
# 
# This dataset was collected from Twitter in 2019. The Twitter API recently went through an update; however, this data uses the previous API conventions. We will use Python's `json` library to parse a random line from the source data to help you visualize its structure. Observe that some of the content is readily available (the `text` field), while other content is harder to parse (the media URL).
# 
# The full list of tweet IDs is available [here](https://ia601005.us.archive.org/31/items/tweetsRickyRenuncia-final/tweetsRickyRenuncia-final.txt).
# 
# Below we show how `try`/`except` blocks and `while` loops can be used to loop through the data until a post with images is found.

# In[ ]:

import json, os
from random import seed, randint

dir_path = os.getcwd()
print(dir_path)

JL_DATA = "/home/rickyrenuncia/tweetsRickyRenuncia-final.jsonl"

# Get the SAMPLE_SIZE
SAMPLE_SIZE = 0
with open(JL_DATA, "r") as data_handler:
    for line in data_handler:
        if line != "\n":
            SAMPLE_SIZE += 1
print(f"Sample Size: {SAMPLE_SIZE}\n\n")

# Get a random number of lines to skip before taking a single sample
# Try seeds 1 and 16, or any other you want to test
seed(1)
skip_lines = randint(0, SAMPLE_SIZE - 1)

# Reopen the file using the with-open-as style and print out a single sample
with open(JL_DATA, "r") as data_handler:
    # Use next() to skip a line; the for loop allows skipping multiple lines
    for _ in range(skip_lines):
        next(data_handler)
    while True:
        # Loop until a tweet with media is found.
        try:
            # Capture the raw string
            raw_data = data_handler.readline()
            if not raw_data:
                break  # End of file reached.
            # Skip lines without any 'media_url_https' key.
            if 'media_url_https' not in raw_data:
                continue
            data = json.loads(raw_data)
        except:
            break
        try:
            i = 0
            while True:
                try:
                    media_url = data['retweeted_status']['entities']['media'][i]['media_url_https']
                except:
                    i += 1
                    if i > 10:
                        media_url = "Could not quickly find a tweet with media."
                        raise  # Pass the error to the previous try/except.
                    continue
                break
        except:
            continue
        print("Text:", data['text'])
        # The tweet URL is a Twitter convention where both the tweet ID and the
        # user's screen_name are required to access the status.
        print("Tweet URL using user's screen_name:", f"https://twitter.com/{data['user']['screen_name']}/status/{data['id_str']}")
        print("Tweet URL using user's ID         :", f"https://twitter.com/{data['user']['id_str']}/status/{data['id_str']}")
        print("Media:", media_url)
        # print(f"In reply to: {json.dumps(data['retweeted_status'], indent=1)}")
        print("\n")
        # The indent and sort_keys arguments of json.dumps "prettify" the output. Still not pretty.
        # print("Raw Data:")
        # print("#" * 50)
        # print(json.dumps(data, indent=4, sort_keys=True))
        # print("#" * 50)
        break

# ## Study the old Twitter API
# 
# Documentation on the old Twitter API version 1.1 can be found [here](https://developer.twitter.com/en/docs/twitter-api/v1) and a sample [over here](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/sample-realtime/api-reference/get-statuses-sample).

# # What data is available
# 
# As data analysts, we need to understand the data before we can set goals.
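# Deeply nested fields such as the media URL are the reason the sampling cell above leans on `try`/`except`. Chained `dict.get` calls with empty defaults are a compact alternative; a minimal sketch, reusing the `data` dictionary parsed above:

# In[ ]:

# Defensive extraction of the first media URL, if any.
# Assumes `data` still holds the tweet parsed in the sampling cell above.
media_list = (
    data.get("retweeted_status", {})
        .get("entities", {})
        .get("media", [])
)
first_media_url = media_list[0].get("media_url_https") if media_list else None
print("Text :", data.get("text"))
print("Media:", first_media_url)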
# In[ ]:

SAMPLE_SIZE = 1113758
data = TweetJLAnalyzer(JL_DATA, reset=True, local_media=False, cache_size=2000)
size = getsizeof(data)
print(str(size))
print(str(size / 1024.0))


# In[ ]:

most_retweeted_media = data.get_most_retweeted_media(40)


# In[ ]:

print("Amount found:", len(most_retweeted_media))
for rt_count, m_id, m in most_retweeted_media[15:21]:
    print(m)
    print("*" * 20 + "\n" + str(rt_count) + " - " + str(m_id) + "\n" + "*" * 20 + "\n\n")


# In[ ]:

most_retweeted_posts = data.get_most_retweeted(100, has_media=True)


# In[ ]:

# Save popular posts
with open("100_most_retweeted_posts.pickle", 'wb') as handler:
    pickle.dump(most_retweeted_posts, handler)


# In[ ]:

# Recall popular posts
with open("100_most_retweeted_posts.pickle", 'rb') as handler:
    most_retweeted_posts = pickle.load(handler)


# In[ ]:

import random

print("Amount found:", len(most_retweeted_posts))
keywords = ("renuncia", "puerto rico", "ricky", "rosell")
for rt_count, tweet_id, key in random.sample(most_retweeted_posts[11:21], 10):
    tweet = data.fetch_by_id(tweet_id)
    if any(word in tweet.data["text"].lower() for word in keywords):
        print(tweet)
        print("*" * 20 + "\n" + str(rt_count) + " - " + str(tweet_id) + " - " + str(key) + "\n" + "*" * 20 + "\n\n")
    else:
        # print(tweet.data["text"])
        print(tweet)
        print("*" * 10 + "\n" + str(rt_count) + " - " + str(tweet_id) + " - " + str(key) + "\n\n")


# In[ ]:

# randint(0, SAMPLE_SIZE - 6)
# print(data.head(5, 40, sep="\n" + "*" * 100 + "\n\n"))
#RickyRenuncia #RickyVeteYa
print(data.head(5, randint(0, SAMPLE_SIZE - 6), sep="\n" + "*" * 100 + "\n\n"))


# In[ ]:

print(data.head(2, sep="\n*************\n"))


# In[ ]:

print(type(data.retweet_cache))
print(str(data.retweet_cache.keys())[:400])
print(str(data.retweet_cache)[:400])


# In[ ]:

print(data.retweet_cache[0][0])
print(str(data.quoteOf)[:400])
print(str(data.retweetOf)[:400])
print(str(data.retweet_cache)[:400])

retweet_counts = list(data.retweet_cache.keys())
retweet_counts.sort(reverse=True)
quote_counts = list(data.quote_cache.keys())
quote_counts.sort(reverse=True)
print(str(retweet_counts)[:400])
print(str(quote_counts)[:400])


# In[ ]:

sample_t = data.fetch_by_position(112)
print(json.dumps(sample_t.data, indent=4))


# In[ ]:

# Find a video tweet
SAMPLE_SIZE = 1113758
count = 0
media_ids = []
with open(JL_DATA, 'r') as data_file:
    for _ in range(SAMPLE_SIZE):
        count += 1
        if count % 200000 == 0:
            print(f"Done with: {count}")
        tweet = TweetAnalyzer(data_file.readline())
        if tweet.hasMedia:
            # print("HasMedia", tweet.hasMedia)
            if len(tweet.media) > 0:
                for m in tweet.media:
                    if m.mtype().lower() != "photo" and m.id not in media_ids:
                        media_ids.append(m.id)
                        print(m.id, m.mtype(), m.url())
                        # print(m.data)
            else:
                print("Length 0??")
                try:
                    print(tweet.data["entities"]["media"])
                except:
                    print("No media under 'entities'")
                try:
                    print(tweet.data["retweeted_status"]["entities"]["media"])
                except:
                    print("No media under 'retweeted_status'")
                print(json.dumps(tweet.data))
                break
print(f"DONE: {count}")
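# The same single pass used above can be generalized to tally distinct media items by type. A minimal sketch, assuming the `TweetAnalyzer` interface shown above (`hasMedia`, `media`, `m.id`, `m.mtype()`):

# In[ ]:

# Sketch: count distinct media items per type across the whole capture.
from collections import Counter

media_type_counts = Counter()
seen_media_ids = set()
with open(JL_DATA, "r") as data_file:
    for line in data_file:
        if line == "\n":
            continue
        tweet = TweetAnalyzer(line)
        if tweet.hasMedia:
            for m in tweet.media:
                if m.id not in seen_media_ids:
                    seen_media_ids.add(m.id)
                    media_type_counts[m.mtype().lower()] += 1
print(media_type_counts)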
# ## Beautiful Imagery
# 
# Title
# 
# Foot Notes
# In[ ]:

# Interactive widget demos. `interact`, `fixed`, and `IntSlider` are imported
# here because this cell can run before the library cell further below.
from ipywidgets import interact, fixed, IntSlider
from IPython.display import display, HTML

def h(p, q):
    return (p, q)

interact(h, p=10, q=fixed(20))


# In[ ]:

# `f` was not defined earlier in this notebook; a minimal stand-in:
def f(x):
    return x

interact(f, x=IntSlider(min=0, max=30, step=1, value=15))


# In[ ]:

@interact(x=(0.0, 20.0, 0.5))
def h(x=5.5):
    return x


# In[ ]:

@interact(x=(8, 20))
def aTitle(x=12):
    # The original HTML markup was lost; assuming `x` drives the font size:
    display(HTML(f"<h1 style='font-size:{x}px'>Hello!</h1>"))


# In[ ]:

import ipywidgets as widgets
from IPython.display import display

button = widgets.Button(description="Click Me!")
output = widgets.Output()
display(button, output)

output.my_n = 0

def on_button_clicked(b):
    with output:
        output.clear_output()
        output.my_n += 1
        print(f"Button clicked. {output.my_n}")

button.on_click(on_button_clicked)


# In[1]:

# Add required libraries
import ipywidgets as widgets
from IPython.core.display import display, HTML, update_display
import json, os, pickle
from random import seed, randint
from tweet_rehydrate.analysis import TweetJLAnalyzer, TweetAnalyzer, getsizeof
from tweet_rehydrate.display import (
    TweetInteractiveClassifier,
    JsonLInteractiveClassifier,
    TSess,
    prepare_google_credentials,
)
from twitter_secrets import (
    C_API_KEY,
    C_API_SECRET,
    ACCESS_TOKEN,
    ACCESS_TOKEN_SECRET,
    C_BEARER_TOKEN,
)

JL_DATA = "/home/rickyrenuncia/tweetsRickyRenuncia-final.jsonl"


# In[3]:

tweet_session = TSess(
    C_BEARER_TOKEN,
    compression_level=5,
    sleep_time=3,
    cache_dir="./.tweet_cache_split/",
    hash_split=True,
)

google_credentials = prepare_google_credentials(credentials_file="./google_translate_keys.json")


# In[3]:

# jl_display = JsonLInteractiveClassifier(
#     tweet_ids_file="tweetsRickyRenuncia-final.txt",
#     session=tweet_session, mute=False)

# Flyer, bulletin, promotion
# April 30
jl_display = JsonLInteractiveClassifier(
    tweet_ids_file="tweetsRickyRenuncia-final.txt",
    session=tweet_session,
    pre_initialized=True,
    sqlite_db=".tweetsRickyRenuncia-final.txt.db",
)


# In[1]:

jl_display.display_another()


# In[5]:

test_tweet = TweetInteractiveClassifier(tweet_id="1150943952616468486", session=tweet_session)


# In[4]:

test_tweet.url()


# In[7]:

# output = widgets.Output()
html = test_tweet.oEmbeded()
# print(html)
# with output:
display(HTML(html))


# In[5]:

print(test_tweet.text())
print(test_tweet.hasMedia)
print(test_tweet.hasLocalMedia)
print(test_tweet.data.keys())
print(test_tweet.data.get("entities", {}))
print(test_tweet.data.get("extended_entities", {}))


# In[ ]:

(test_tweet.url(), test_tweet.isRetweet, test_tweet.retweeted_status.url())


# In[ ]:

test_tweet.display()


# In[6]:

test_tweet.data.keys()


# In[ ]:

# Inspect the raw payload; index a specific key as needed.
test_tweet.data