#!/usr/bin/env python
# coding: utf-8

# # Update Database Structure
#
# Changes in the data stored and format will affect how the information is
# processed and stored. An update method was created to change the storage.

# In[1]:

import ipywidgets as widgets
from IPython.core.display import display, HTML, update_display
import json, os, pickle
from random import seed, randint
from tweet_requester.analysis import TweetAnalyzer
from tweet_requester.display import TweetInteractiveClassifier, \
    JsonLInteractiveClassifier, TSess, prepare_google_credentials, \
    PROCESSING_STAGES, logging
from twitter_secrets import C_BEARER_TOKEN

JL_DATA = "./tweetsRickyRenuncia-final.jsonl"
BASE_DIR = "./Evaluating Content"

# Update database
# April 30, 2021 the RR team rehydrated with twarc their data.
april302021 = 1619755200.0
# git_commit = "9219b7a01ce28f5bc0d61c913b3f914f967614fd"
git_commit = "2ac78595cceef98a56c518c24f2187360e1527e3"

# Cached, rate-limited Twitter API session used by the classifier below.
tweet_session = TSess(
    C_BEARER_TOKEN,
    compression_level=5,
    sleep_time=3,
    cache_dir="./tweet_cache/",
    hash_split=True,
)

google_credentials = prepare_google_credentials(
    credentials_file="./google_translate_keys.json"
)

# In[2]:

# Opens (pre-initialized) the sqlite-backed classifier over the hydrated tweets.
classifier = JsonLInteractiveClassifier(
    tweet_ids_file="tweetsRickyRenuncia-final.txt",
    session=tweet_session,
    mute=True,
    google_credentials=google_credentials,
    pre_initialized=True,
    sqlite_db="tweets.db",
)

# In[3]:

classifier.close()

# In[4]:

import logging

logging.basicConfig(level=logging.WARNING)
# Run the schema/storage migrations in order: v01->v02->v03->v04.
classifier.update_database_v01_v02(dateCreated=april302021, git_commit=git_commit)
classifier.update_database_v02_v03(git_commit=git_commit)
classifier.update_database_v03_v04(git_commit=git_commit)

# In[4]:

# Report how many tweets sit in each processing stage.
classifier.connect()
cur = classifier.cursor()
cur.execute("""
    SELECT state, count(*) from tweet GROUP BY state ORDER BY state;""")
rows = cur.fetchall()
print("{:>25} | {:<8}".format("PROCESSING_STAGE", "COUNT"))
print("{:>25} | {:<8}".format("-" * 25, "-" * 8))
for row in rows:
    # FIX: this format string was broken across two lines in the export
    # (a syntax error); restored to one line, mirroring the header format.
    print("{:>25} | {:<8}".format(PROCESSING_STAGES(row[0]).name, row[1]))

cur.execute(
    """
    SELECT * from tweet WHERE tweet_id in (
        SELECT tweet_id FROM tweet WHERE state in (?));""",
    (PROCESSING_STAGES.PREPROCESSED.value,),
)
rows_sample = cur.fetchall()
print("\n\nSample: ")
n = 0
cur.close()
# Show at most 5 sample rows from the PREPROCESSED stage.
for row in rows_sample:
    print("\t", row)
    n += 1
    if n > 4:
        break

# In[5]:

classifier.display_accepted(page=3, per_page=3)

# In[5]:

classifier.StartEvaluations()

# In[6]:

# Reset up to 10 tweets stuck in REVIEWING back to UNPROCESSED.
classifier.connect()
cur = classifier.cursor()
cur.execute(
    """
    SELECT * from tweet WHERE tweet_id in (
        SELECT tweet_id FROM tweet WHERE state in (?));""",
    (PROCESSING_STAGES.REVIEWING.value,),
)
rows = cur.fetchall()
n = 0
cur.close()
for row in rows:
    print(row)
    classifier.tweet_set_state(
        tweet_id=row[0],
        state=PROCESSING_STAGES.UNPROCESSED
    )
    n += 1
    if n > 9:
        break

# In[7]:

page = 5
per_page = 5
classifier.display_accepted(page=page, per_page=per_page)

# In[ ]:

from datetime import datetime
from time import sleep
import logging

# Poll loop: pull a batch of 150 tweets at most once every 15 minutes,
# stopping on the first error raised by preprocess_batch.
last_pull = datetime.now().timestamp() - 900
current_time = datetime.now().timestamp()  # FIX: dropped unused `end` binding
while True:
    if current_time - last_pull > 900:
        start_pull = datetime.now().timestamp()
        try:
            classifier.preprocess_batch(n=150)
        except Exception as err:
            logging.error(err)
            break
        # Average the download time to the middle of the transaction.
        last_pull = (start_pull + datetime.now().timestamp()) / 2.0
    else:
        current_time = datetime.now().timestamp()
        # sleep for time left for 15 minutes
        # FIX: clamp at 0 — time.sleep raises ValueError on negative input,
        # which can happen if the 900 s window elapses between timestamps.
        sleep(max(0.0, 900 - (current_time - last_pull)))
        current_time = datetime.now().timestamp()

# In[8]:

classifier.preprocess_batch(n=250)

# In[10]:

# Install a pip package in the current Jupyter kernel
import sys
get_ipython().system('{sys.executable} -m pip install tweet-requester')