#!/usr/bin/env python # coding: utf-8 # # Using Schema.org to create RDF/JSON-LD relations using Twitter Data # # ## Curating a dataset for FAIR # # This module's objective is to explore the usage of Schema.org, specifically the Social Media Posting and the Creative Work metadata template, to crosswalk the #RickyRenuncia Twitter dataset from a jsonl to an RDF/JSON-LD structure. # # ## From Unstructured Data to Structured Data # # The #RickyRenuncia Twitter dataset can be thought of as unstructured data. By using Schema.org metadata templates, a dataset can be structured preserving data relations that can enhance findability, interoperability, accessibility and reusability (FAIR) of this dataset for research purposes. # # [Schema.org](https://schema.org/) was founded by Google, Microsoft, Yahoo and Yandex, with the purpose of developing a standard vocabulary that allows different search engines to use data from multiple sources. # # # Metadata Crosswalk: From JSONL to Social Media Posting from Schema.org # # ## Metadata Design # # The metadata vocabularies used in this project are extracted from Schema.org. By combining the Social Media Posting and the Creative Work template the project was able to successfully create relations between Original Tweets, Retweets, Quoted Retweets and their shared media. # # An important metadata field used to allow this relation is the [isBasedOn](https://schema.org/isBasedOn) field from Schema.org. The isBasedOn field is used to represent "A resource from which this work is derived or from which it is a modification or adaption" (from the Schema.org website). It accepts as a digital object a URL or an Item derived from a Creative Work template. # # The Resource templates created for this project attempt to preserve relevant information of the Tweet object. 
# # For this project, the relevant information is: # # * User identifier # * User Tweet Handle # * Date of the Tweet # * Text of the tweet # * Hashtags # * Other Twitter User Mentions # * Tweet Media # * Whether it is based on another tweet or not # # This project did not work on making explicit the relations between users, nor on keeping information related to the amounts of retweets or likes. It has to be mentioned that the Social Media Posting template offers metadata fields for those types of information. # # ## Entities and Relations # # The types of entities in this collection are the author, tweet, media, and retweeted or quoted tweets. # # Items are related to each other in multiple ways, such as: # # If the item is a quoted tweet. -----> This relation takes form by using the isBasedOn metadata field. # If the item is a retweet. -----> This relation takes form by using the isBasedOn metadata field. # If both the quoted tweet and the retweet contain media. -----> This relation takes form by using the isBasedOn and sharedContent metadata fields. # By author -----> This relation takes form by using an Omeka Resource, which required creating a resource template. 
# # # [crosswalk.jpg](RickyRenuncia-case-module_shared/crosswalk.jpg)

# In[ ]:

import os

import requests
from typing import List, Union
from xml.etree import ElementTree as ET
import re
import json

from tweet_rehydrate.analysis import TweetAnalyzer, TweetMedia

# Public IP of the Omeka S instance; used to build the absolute @id URLs of
# resource classes and resource templates.
IP = "3.235.43.90"


def download_media(media: TweetMedia, filename: str):
    """Download a tweet's media file to *filename*.

    Returns *filename* on success, or None when the HTTP request fails.
    """
    response = requests.get(media.url())
    if response.status_code != 200:
        return None
    with open(filename, "wb") as tmpfile:
        for chunk in response.iter_content():
            if chunk:
                tmpfile.write(chunk)
    return filename


def create_tweet_omeka(tweet: Union[dict, TweetAnalyzer], item_set: int,
                       url_api, key_identity, key_credential):
    """Create an Omeka item (schema:SocialMediaPosting) for *tweet*.

    If an item with the same tweet identifier already exists in *item_set*,
    that item is returned instead.  Retweets and quote tweets recursively
    create their source tweet first and link it through schema:isBasedOn.
    Returns the decoded JSON of the created/found item, or None on failure.
    """
    if not isinstance(tweet, TweetAnalyzer):
        tweet = TweetAnalyzer(tweet)
    tweet._hasLocalMedia()

    # Deduplicate: reuse the item when the tweet is already in Omeka.
    results = seach_tweet_omeka(tweet, item_set, url_api)
    if len(results) == 1:
        return results[0]

    author = create_author_omeka(
        tweet.user_id, tweet.user_screen_name,
        item_set, url_api, key_identity, key_credential)

    params = {
        "key_identity": key_identity,
        "key_credential": key_credential,
    }

    # Download every media attachment and build its upload descriptor.
    all_media = []        # o:media / schema:sharedContent descriptors
    all_media_fn = []     # local filenames, parallel to all_media_index
    all_media_ft = []     # MIME types, parallel to all_media_index
    all_media_index = []  # file_index values used in the multipart payload
    media_count = 0
    os.makedirs("tmps", exist_ok=True)  # FIX: download dir was assumed to exist
    for media in tweet.media:
        tmp_name = f"tmps/file{media_count}"
        save_name = download_media(media, tmp_name)
        if save_name:
            all_media.append({
                "o:ingester": "upload",
                "file_index": media_count,
                "o:item": {},
                "schema:identifier": [
                    {
                        "type": "literal",
                        "property_id": 1623,
                        "property_label": "identifier",
                        # "is_public": true,
                        "@value": media.url()
                    }
                ],  # FIX: this comma was missing (syntax error) in the original
                "dcterms:title": [{
                    "property_id": 1,
                    "property_label": "Title",
                    "@value": media.title(),
                    "type": "literal"}]
            })
            all_media_index.append(media_count)
            all_media_fn.append(save_name)
            all_media_ft.append(media.mtype())
            media_count += 1

    data = {
        "@type": [
            "o:Item",
            "schema:SocialMediaPosting"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            "@id": f"http://{IP}/api/resource_classes/1460",
            "o:id": 1460
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/8",
            "o:id": 8
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": tweet.urlByIDs()
            }
        ],
        "schema:url": [
            {
                "type": "literal",
                "property_id": 1763,
                "property_label": "url",
                "@value": tweet.url()
            }
        ],
        "schema:dateCreated": [
            {
                "type": "literal",
                "property_id": 2520,
                "property_label": "dateCreated",
                "@value": tweet.data["created_at"]
            }
        ],
        "schema:author": [
            {
                "type": "resource",
                "@id": author["@id"],
                "property_id": 1616,
                "property_label": "author",
                "value_resource_id": author["o:id"],
                "value_resource_name": "items",
            }
        ],
        "schema:text": [
            {
                "type": "literal",
                "property_id": 2618,
                "property_label": "text",
                "@value": tweet.data["full_text"]
            }
        ],
        "schema:sharedContent": all_media,
        "schema:mentions": [
            {
                "type": "literal",
                "property_id": 2089,
                "property_label": "mentions",
                "@value": uMention["screen_name"]
            } for uMention in tweet.data["entities"]["user_mentions"]
        ],
        "schema:keywords": [
            {
                "type": "literal",
                "property_id": 2329,
                "property_label": "keywords",
                "@value": hashtag["text"]
            } for hashtag in tweet.data["entities"]["hashtags"]
        ],
        "o:media": all_media,
    }

    # Link retweets / quote tweets to their source item via schema:isBasedOn.
    if tweet.isRetweet:
        retweeted = create_tweet_omeka(
            tweet.data["retweeted_status"], item_set,
            url_api, key_identity, key_credential)
        data["schema:isBasedOn"] = [{
            "type": "resource",
            "@id": retweeted["@id"],
            "property_id": 2104,
            "property_label": "isBasedOn",
            "value_resource_id": retweeted["o:id"],
            "value_resource_name": "items"
        }]
    if tweet.isQuote:
        quoted = create_tweet_omeka(
            tweet.data["quoted_status"], item_set,
            url_api, key_identity, key_credential)
        data["schema:isBasedOn"] = [{
            "type": "resource",
            "@id": quoted["@id"],
            "property_id": 2104,
            "property_label": "isBasedOn",
            "value_resource_id": quoted["o:id"],
            "value_resource_name": "items",
        }]

    files_payload = [
        ('data', (None, json.dumps(data), 'application/json')),
    ]
    open_handles = []
    try:
        for idx, fname, ftype in zip(all_media_index, all_media_fn, all_media_ft):
            handle = open(fname, "rb")  # FIX: handles are now closed after the POST
            open_handles.append(handle)
            files_payload.append(
                (f'file[{idx}]', (fname.split("/")[-1], handle, ftype)))
        # No Content-Type header here: requests must set the multipart boundary.
        response = requests.post(
            url=url_api + "items/",
            params=params,
            files=files_payload)
    finally:
        for handle in open_handles:
            handle.close()
    if response.status_code == 200:
        return json.loads(response.text)
    print("Failed to Create Tweet")
    return None


def seach_tweet_omeka(tweet: Union[dict, TweetAnalyzer], item_set: int, url_api):
    """Search *item_set* for an item whose schema:identifier equals the tweet
    URL.  Returns the decoded result list ([] when the request fails).
    """
    if not isinstance(tweet, TweetAnalyzer):
        tweet = TweetAnalyzer(tweet)
    # FIX: Omeka S expects the filters flattened in the query string, e.g.
    # ...?item_set_id=N&property[0][property]=1623&property[0][type]=eq&...
    # (see the example URLs quoted below); the original sent a nested dict as
    # a GET request *body* (data=), which the server ignores.
    query = {
        "item_set_id": item_set,
        "property[0][property]": 1623,  # Schema:identifier = URL of the tweet
        "property[0][type]": "eq",
        "property[0][text]": tweet.urlByIDs(),
    }
    response = requests.get(url=url_api + "items/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []  # FIX: callers call len() on the result; None crashed them


def create_author_omeka(ID, screen_name, item_set, url_api,
                        key_identity, key_credential):
    """Create an Omeka item (schema:Person) for a tweet author, reusing an
    existing matching item when one is found.  Returns the item JSON or None.
    """
    results = search_author_omeka(ID, screen_name, item_set, url_api)
    if len(results) == 1:
        return results[0]
    headers = {'Content-type': 'application/json'}
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    data = {
        "@type": [
            "o:Item",
            "schema:Person"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            # FIX: these were plain strings, so a literal "{IP}" was sent
            "@id": f"http://{IP}/api/resource_classes/1689",
            "o:id": 1689
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/7",
            "o:id": 7
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": f"https://twitter.com/{screen_name}"
            },
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": ID
            }
        ],
        "schema:alternateName": [
            {
                "type": "literal",
                "property_id": 2700,
                "property_label": "alternateName",
                "@value": screen_name
            }
        ]
    }
    response = requests.post(
        url=url_api + "items/",
        data=json.dumps(data),
        params=params,
        headers=headers)
    if response.status_code == 200:
        return json.loads(response.text)
    print("Failed to Create Author")
    return None


def search_author_omeka(ID, screen_name, item_set, url_api):
    """Search *item_set* for an author item by screen name and account URL.
    Returns the decoded result list ([] when the request fails).
    """
    # Example query shapes from the Omeka S REST API:
    # http://3.237.93.184/api/items?item_set_id=3433&property[0][property]=2618&property[0][type]=eq&property[0][text]=popo
    # http://3.237.93.184/api/items?item_set_id=3433&property[0][property]=1623&property[0][type]=eq&property[0][text]=caca
    query = {
        "item_set_id": item_set,
        "property[0][property]": 2700,  # Schema:alternateName = screen name
        "property[0][type]": "eq",
        "property[0][text]": screen_name,
        "property[1][property]": 1623,  # Schema:identifier = account URL
        "property[1][type]": "eq",
        "property[1][text]": f"https://twitter.com/{screen_name}",
    }
    response = requests.get(url=url_api + "items/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []


def create_media_omeka(URL: str, item_set, url_api, key_identity, key_credential):
    """Create an Omeka media resource (schema:MediaObject) from a media URL,
    reusing an existing one when found.  Returns the media JSON or None.
    """
    results = search_media_omeka(URL, item_set, url_api)
    if len(results) == 1:
        return results[0]
    get_media = requests.get(URL)
    if get_media.status_code != 200:
        return None
    with open("tmp_file.tmp", "wb") as tmp_file:
        for chunk in get_media.iter_content():
            if chunk:
                tmp_file.write(chunk)
    title = URL.split("/")[-1].split("?")[0]
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    data = {
        "@type": [
            "o:Media",
            "schema:MediaObject"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            # FIX: the original "\/" sequences are not JSON escapes in Python
            # source — literal backslashes were sent; use the IP constant.
            "@id": f"http://{IP}/api/resource_classes/1821",
            "o:id": 1821
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/11",
            "o:id": 11
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": URL
            },
        ],
        "o:ingester": "upload",
        "file_index": "0",
        "o:media": [
            {
                "o:ingester": "upload",
                "file_index": "0",
                "o:item": {},
                "dcterms:title": [
                    {
                        "property_id": 1,
                        "property_label": "Title",
                        "@value": "My media upload title",
                        "type": "literal"
                    }
                ]
            }
        ],
    }
    print(title)
    # FIX: no JSON Content-Type header with a multipart upload — requests
    # must be free to set the multipart boundary; handle is closed afterwards.
    with open("tmp_file.tmp", "rb") as upload:
        files = [
            ('data', (None, json.dumps(data), 'application/json')),
            ('file[0]', (title, upload, 'image/jpg'))
        ]
        response = requests.post(url=url_api + "media/",
                                 params=params, files=files)
    if response.status_code == 200:
        print(response.text)
        return json.loads(response.text)
    print("Failed to Create Media")
    print(response.text)
    return None


def search_media_omeka(URL, item_set, url_api):
    """Search *item_set* for a media resource identified by *URL*.
    Returns the decoded result list ([] when the request fails).
    """
    query = {
        "item_set_id": item_set,
        "property[0][property]": 1623,  # Schema:identifier = media URL
        "property[0][type]": "eq",
        "property[0][text]": URL,
    }
    response = requests.get(url=url_api + "media/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []


def create_new_set(set_name: str, url_api: str, key_identity, key_credential):
    """Create a new Omeka item set named *set_name*.

    Returns the new set's decoded metadata, or None on failure.
    """
    headers = {'Content-type': 'application/json'}
    data = {
        "dcterms:title": [
            {
                "type": "literal",
                "property_label": "Title",
                "@value": set_name,
                "property_id": 1
            }
        ]
    }
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    item_sets_api_url = url_api + "item_sets"
    response = requests.post(
        item_sets_api_url,
        params=params,
        data=json.dumps(data),
        headers=headers
    )
    print(response.status_code)
    if response.status_code == 200:
        print(response.text)
        return json.loads(response.text)
    return None


def main():
    """Create the item set and upload up to ~300 tweets from the local
    rrhydrated.jsonl file, one Omeka item per JSON line.
    """
    url_api = f"http://{IP}/api/"
    # NOTE(review): credentials are hard-coded; move them to environment
    # variables or a config file before publishing this script.
    key_identity = "IJYkHAmT6fc928IYhaXUoiOOenudMayZ"
    key_credential = "MO9aXJE9TdbrlMEfyq7H4DyFLoAZ2EmD"
    new_set: dict = create_new_set(
        "RickyRenuncia Tweets",
        url_api,
        key_identity=key_identity,
        key_credential=key_credential
    )
    if new_set is None:  # FIX: avoid TypeError when set creation fails
        print("Failed to create item set; aborting.")
        return
    set_id = new_set["o:id"]
    counter = 0
    with open("rrhydrated.jsonl", "r") as jlFile:
        for json_str in jlFile:
            try:
                create_tweet_omeka(json.loads(json_str), set_id,
                                   url_api, key_identity, key_credential)
            except Exception as exc:  # FIX: bare except also hid Ctrl-C
                print(f"Skipping tweet: {exc}")
                continue
            counter += 1
            print(counter)
            if counter > 300:
                break


if __name__ == "__main__":  # FIX: allow import without running the upload
    main()