This module objective is to explore the usage of Schema.org, specifically the Social Media Posting and the Creative Work metadata template, to crosswalk the #RickyRenuncia Twitter dataset from a jsonl to an RDF/JSON-LD structure.
The #RickyRenuncia Twitter dataset can be thought of as unstructured data. By using Schema.org metadata templates, a dataset can be structured while preserving data relations, which can enhance the findability, accessibility, interoperability, and reusability (FAIR) of this dataset for research purposes.
Schema.org was founded by Google, Microsoft, Yahoo and Yandex, with the purpose of developing a standard vocabulary that allows different search engines to use data from multiple sources.
The metadata vocabularies used in this project are extracted from Schema.org. By combining the Social Media Posting and the Creative Work templates, the project was able to successfully create relations between Original Tweets, Retweets, Quoted Retweets and their shared media.
An important metadata field used to allow this relation is the isBasedOn from Schema.org. The isBasedOn field is used to represent "A resource from which this work is derived or from which it is a modification or adaption" (from Schema.org Website). It accepts as a digital object a URL or an Item derived from a Creative Work template.
The Resource templates created for this project attempt to preserve relevant information of the Tweet object.
For this project, the relevant information is:
This project did not work on making explicit the relations between users, nor on keeping information related to the number of retweets or likes. It should be mentioned that the Social Media Posting template offers metadata fields for those types of information.
The types of entities in this collection are the author, tweet, media, and retweeted or quoted tweets.
Items are related to each other in multiple ways, such as:
If the item is a quoted tweet -----> This relation takes form by using the isBasedOn metadata field. If the item is a retweet -----> This relation takes form by using the isBasedOn metadata field. If both the quoted tweet and retweet contain media -----> This relation takes form by using the isBasedOn and sharedContent metadata fields. By author -----> This relation takes form by using an Omeka Resource, which required creating a resource template.
import requests
from typing import List, Union
from xml.etree import ElementTree as ET
import re
import json
from tweet_rehydrate.analysis import TweetAnalyzer, TweetMedia
IP = "3.235.43.90"
def download_media(media: TweetMedia, filename: str):
    """Download a tweet's media file to a local path.

    Args:
        media: Tweet media object exposing a ``url()`` accessor.
        filename: Local path the downloaded bytes are written to.

    Returns:
        The filename on success, or ``None`` if the HTTP GET failed.
    """
    # stream=True defers the body download so iter_content() can write
    # it to disk chunk-by-chunk instead of buffering it all in memory.
    response = requests.get(media.url(), stream=True)
    if response.status_code != 200:
        return None
    with open(filename, "wb") as tmpfile:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                tmpfile.write(chunk)
    return filename
def create_tweet_omeka(tweet: Union[dict, TweetAnalyzer], item_set: int, url_api, key_identity, key_credential):
    """Create (or reuse) an Omeka item describing a tweet.

    Builds a ``schema:SocialMediaPosting`` item with the tweet's
    identifier, URL, creation date, author, text, mentions, hashtags and
    downloaded media, then POSTs it to the Omeka items endpoint.
    Retweeted and quoted tweets are created recursively and linked
    through ``schema:isBasedOn``.

    Args:
        tweet: Raw tweet dict or an already-wrapped ``TweetAnalyzer``.
        item_set: Omeka item-set id the item is added to.
        url_api: Base URL of the Omeka API (trailing slash included).
        key_identity: Omeka API key identity.
        key_credential: Omeka API key credential.

    Returns:
        The Omeka item metadata dict on success, otherwise ``None``.
    """
    if not isinstance(tweet, TweetAnalyzer):
        tweet = TweetAnalyzer(tweet)
    tweet._hasLocalMedia()
    # Reuse an already-ingested item instead of creating a duplicate.
    # Guard against the search returning None on an HTTP failure.
    results = seach_tweet_omeka(tweet, item_set, url_api)
    if results and len(results) == 1:
        return results[0]
    author = create_author_omeka(
        tweet.user_id, tweet.user_screen_name, item_set, url_api, key_identity, key_credential)
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    all_media = []
    all_media_fn = []
    all_media_ft = []
    all_media_index = []
    media_count = 0
    for media in tweet.media:
        tmp_name = f"tmps/file{media_count}"
        save_name = download_media(media, tmp_name)
        if save_name:
            all_media.append({
                "o:ingester": "upload",
                "file_index": media_count,
                "o:item": {},
                "schema:identifier": [
                    {
                        "type": "literal",
                        "property_id": 1623,
                        "property_label": "identifier",
                        "@value": media.url()
                    }
                ],  # the missing comma here was a SyntaxError in the original
                "dcterms:title": [{
                    "property_id": 1,
                    "property_label": "Title",
                    "@value": media.title(),
                    "type": "literal"}]
            })
            all_media_index.append(media_count)
            all_media_fn.append(save_name)
            all_media_ft.append(media.mtype())
            # Only advance the counter on success so file_index values
            # stay aligned with positions in the three parallel lists.
            media_count += 1
    data = {
        "@type": [
            "o:Item",
            "schema:SocialMediaPosting"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            "@id": f"http://{IP}/api/resource_classes/1460",
            "o:id": 1460
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/8",
            "o:id": 8
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": tweet.urlByIDs()
            }
        ],
        "schema:url": [
            {
                "type": "literal",
                "property_id": 1763,
                "property_label": "url",
                "@value": tweet.url()
            }
        ],
        "schema:dateCreated": [
            {
                "type": "literal",
                "property_id": 2520,
                "property_label": "dateCreated",
                "@value": tweet.data["created_at"]
            }
        ],
        "schema:author": [
            {
                "type": "resource",
                "@id": author["@id"],
                "property_id": 1616,
                "property_label": "author",
                "value_resource_id": author["o:id"],
                "value_resource_name": "items",
            }
        ],
        "schema:text": [
            {
                "type": "literal",
                "property_id": 2618,
                "property_label": "text",
                "@value": tweet.data["full_text"]
            }
        ],
        "schema:sharedContent": all_media,
        "schema:mentions": [
            {
                "type": "literal",
                "property_id": 2089,
                "property_label": "mentions",
                "@value": uMention["screen_name"]
            }
            for uMention in tweet.data["entities"]["user_mentions"]
        ],
        "schema:keywords": [
            {
                "type": "literal",
                "property_id": 2329,
                "property_label": "keywords",
                "@value": hashtag["text"]
            } for hashtag in tweet.data["entities"]["hashtags"]
        ],
        "o:media": all_media,
    }
    # Collect every isBasedOn relation; the original overwrote the
    # retweet link when a tweet was both a retweet and a quote.
    based_on = []
    if tweet.isRetweet:
        retweeted = create_tweet_omeka(
            tweet.data["retweeted_status"],
            item_set, url_api, key_identity, key_credential
        )
        if retweeted:
            based_on.append({
                "type": "resource",
                "@id": retweeted["@id"],
                "property_id": 2104,
                "property_label": "isBasedOn",
                "value_resource_id": retweeted["o:id"],
                "value_resource_name": "items"
            })
    if tweet.isQuote:
        quoted = create_tweet_omeka(
            tweet.data["quoted_status"], item_set, url_api, key_identity, key_credential)
        if quoted:
            based_on.append({
                "type": "resource",
                "@id": quoted["@id"],
                "property_id": 2104,
                "property_label": "isBasedOn",
                "value_resource_id": quoted["o:id"],
                "value_resource_name": "items",
            })
    if based_on:
        data["schema:isBasedOn"] = based_on
    files = [
        ('data', (None, json.dumps(data), 'application/json')),
    ]
    # Keep the upload handles so they can be closed after the POST;
    # the original leaked one open file per media attachment.
    open_handles = []
    try:
        for idx in all_media_index:
            handle = open(all_media_fn[idx], "rb")
            open_handles.append(handle)
            files.append(
                (f'file[{idx}]',
                 (all_media_fn[idx].split("/")[-1], handle, all_media_ft[idx]))
            )
        response = requests.post(
            url=url_api + "items/",
            params=params,
            files=files)
    finally:
        for handle in open_handles:
            handle.close()
    if response.status_code == 200:
        return json.loads(response.text)
    print("Failed to Create Tweet")
    return None
def seach_tweet_omeka(tweet: Union[dict, TweetAnalyzer], item_set: int, url_api):
    """Search Omeka for an existing item matching a tweet's identifier URL.

    (Name keeps the original "seach" spelling because callers use it.)

    Args:
        tweet: Raw tweet dict or an already-wrapped ``TweetAnalyzer``.
        item_set: Omeka item-set id to search within.
        url_api: Base URL of the Omeka API.

    Returns:
        The parsed list of matching items, or ``[]`` on HTTP failure so
        callers can safely take ``len()`` of the result.
    """
    if not isinstance(tweet, TweetAnalyzer):
        tweet = TweetAnalyzer(tweet)
    query = {
        "item_set": item_set,
        "property": [
            {
                "property": 1623,  # schema:identifier = URL of the tweet
                "type": "eq",
                "text": tweet.urlByIDs()
            }
        ]
    }
    # GET payloads belong in the query string; the original passed
    # data=, which sends a request body most servers ignore on GET.
    # NOTE(review): Omeka expects bracketed keys like
    # property[0][property] — confirm how this nested dict encodes.
    response = requests.get(url=url_api + "items/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []
def create_author_omeka(ID, screen_name, item_set, url_api, key_identity, key_credential):
    """Create (or reuse) an Omeka ``schema:Person`` item for a tweet author.

    Args:
        ID: Twitter numeric user id.
        screen_name: Twitter screen name (without the leading '@').
        item_set: Omeka item-set id the item is added to.
        url_api: Base URL of the Omeka API.
        key_identity: Omeka API key identity.
        key_credential: Omeka API key credential.

    Returns:
        The Omeka item metadata dict on success, otherwise ``None``.
    """
    # Guard against the search returning None on an HTTP failure.
    results = search_author_omeka(ID, screen_name, item_set, url_api)
    if results and len(results) == 1:
        return results[0]
    headers = {'Content-type': 'application/json'}
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    data = {
        "@type": [
            "o:Item",
            "schema:Person"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            # The original strings lacked the f-prefix, sending the
            # literal text "{IP}" instead of the server address.
            "@id": f"http://{IP}/api/resource_classes/1689",
            "o:id": 1689
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/7",
            "o:id": 7
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": f"https://twitter.com/{screen_name}"
            },
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": ID
            }
        ],
        "schema:alternateName": [
            {
                "type": "literal",
                "property_id": 2700,
                "property_label": "alternateName",
                "@value": screen_name
            }
        ]
    }
    response = requests.post(
        url=url_api + "items/", data=json.dumps(data), params=params, headers=headers)
    if response.status_code == 200:
        return json.loads(response.text)
    print("Failed to Create Author")
    return None
def search_author_omeka(ID, screen_name, item_set, url_api):
    """Search Omeka for an existing author item by screen name and profile URL.

    Args:
        ID: Twitter numeric user id (unused in the query; kept for
            signature compatibility with callers).
        screen_name: Twitter screen name to match.
        item_set: Omeka item-set id to search within.
        url_api: Base URL of the Omeka API.

    Returns:
        The parsed list of matching items, or ``[]`` on HTTP failure so
        callers can safely take ``len()`` of the result.
    """
    query = {
        "item_set": item_set,
        "property": [
            {
                "property": 2700,  # schema:alternateName = tweet screen name
                "type": "eq",
                "text": screen_name
            },
            {
                "property": 1623,  # schema:identifier = URL of the user account
                "type": "eq",
                "text": f"https://twitter.com/{screen_name}"
            }
        ]
    }
    # Send the query in the URL query string, not the GET body.
    response = requests.get(url=url_api + "items/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []
def create_media_omeka(URL: str, item_set, url_api, key_identity, key_credential):
    """Create (or reuse) an Omeka ``schema:MediaObject`` for a remote file.

    Downloads the file at *URL* to a local temp file, then uploads it
    through the Omeka media endpoint.

    Args:
        URL: Remote media URL.
        item_set: Omeka item-set id the media is added to.
        url_api: Base URL of the Omeka API.
        key_identity: Omeka API key identity.
        key_credential: Omeka API key credential.

    Returns:
        The Omeka media metadata dict on success, otherwise ``None``.
    """
    # Guard against the search returning None on an HTTP failure.
    results = search_media_omeka(URL, item_set, url_api)
    if results and len(results) == 1:
        return results[0]
    get_media = requests.get(URL)
    if get_media.status_code != 200:
        return None
    with open("tmp_file.tmp", "wb") as tmp_file:
        for chunk in get_media.iter_content():
            if chunk:
                tmp_file.write(chunk)
    # Strip any query string so the title is just the file name.
    title = URL.split("/")[-1].split("?")[0]
    params = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    data = {
        "@type": [
            "o:Media",
            "schema:MediaObject"
        ],
        "o:item_set": [{"o:id": item_set}],
        "o:resource_class": {
            # The original hard-coded the address with literal "\/"
            # backslashes (Python does not interpret that escape);
            # build it from the IP constant like the other endpoints.
            "@id": f"http://{IP}/api/resource_classes/1821",
            "o:id": 1821
        },
        "o:resource_template": {
            "@id": f"http://{IP}/api/resource_templates/11",
            "o:id": 11
        },
        "schema:identifier": [
            {
                "type": "literal",
                "property_id": 1623,
                "property_label": "identifier",
                "@value": URL
            },
        ],
        "o:ingester": "upload",
        "file_index": "0",
        "o:media": [
            {
                "o:ingester": "upload",
                "file_index": "0",
                "o:item": {},
                "dcterms:title": [
                    {
                        "property_id": 1,
                        "property_label": "Title",
                        "@value": "My media upload title",
                        "type": "literal"
                    }
                ]
            }
        ],
    }
    print(title)
    # Close the upload handle after the POST (the original leaked it).
    with open("tmp_file.tmp", "rb") as upload:
        files = [
            ('data', (None, json.dumps(data), 'application/json')),
            # NOTE(review): content type is hard-coded to image/jpg even
            # for non-JPEG media — confirm against actual downloads.
            ('file[0]', (title, upload, 'image/jpg'))
        ]
        # Do NOT force Content-type here: with files= requests must set
        # its own multipart/form-data boundary header; the original's
        # 'application/json' header would have corrupted the upload.
        response = requests.post(url=url_api + "media/",
                                 params=params, files=files)
    if response.status_code == 200:
        print(response.text)
        return json.loads(response.text)
    print("Failed to Create Media")
    print(response.text)
    return None
def search_media_omeka(URL, item_set, url_api):
    """Search Omeka for an existing media item identified by *URL*.

    Args:
        URL: Remote media URL used as the schema:identifier value.
        item_set: Omeka item-set id to search within.
        url_api: Base URL of the Omeka API.

    Returns:
        The parsed list of matching media, or ``[]`` on HTTP failure so
        callers can safely take ``len()`` of the result.
    """
    query = {
        "item_set": item_set,
        "property": [
            {
                "property": 1623,  # schema:identifier = media URL
                "type": "eq",
                "text": URL
            }
        ]
    }
    # Send the query in the URL query string, not the GET body.
    response = requests.get(url=url_api + "media/", params=query)
    if response.status_code == 200:
        return json.loads(response.text)
    return []
# http://3.237.93.184/api/items?item_set_id=3433&property[0][property]=2618&property[0][type]=eq&property[0][text]=popo
def create_new_set(set_name: str, url_api: str, key_identity, key_credential):
    """Create a new Omeka item set titled *set_name*.

    Args:
        set_name: Title given to the new item set.
        url_api: Base URL of the Omeka API (trailing slash included).
        key_identity: Omeka API key identity.
        key_credential: Omeka API key credential.

    Returns:
        The parsed item-set metadata on HTTP 200, otherwise ``None``.
    """
    payload = json.dumps({
        "dcterms:title": [
            {
                "type": "literal",
                "property_label": "Title",
                "@value": set_name,
                "property_id": 1
            }
        ]
    })
    credentials = {
        "key_identity": key_identity,
        "key_credential": key_credential
    }
    response = requests.post(
        url_api + "item_sets",
        params=credentials,
        data=payload,
        headers={'Content-type': 'application/json'}
    )
    print(response.status_code)
    if response.status_code != 200:
        return None
    print(response.text)
    return json.loads(response.text)
def main():
    """Ingest tweets from ``rrhydrated.jsonl`` into a new Omeka item set.

    Creates the item set, then processes up to ~300 tweets, skipping
    (with a message) any line that fails to parse or upload.
    """
    url_api = f"http://{IP}/api/"
    # SECURITY: API credentials are hard-coded in source. Move them to
    # environment variables or a config file before publishing.
    key_identity = "IJYkHAmT6fc928IYhaXUoiOOenudMayZ"
    key_credential = "MO9aXJE9TdbrlMEfyq7H4DyFLoAZ2EmD"
    new_set: dict = create_new_set(
        "RickyRenuncia Tweets",
        url_api,
        key_identity=key_identity,
        key_credential=key_credential
    )
    # create_new_set returns None on failure; the original crashed here
    # with a TypeError when the item set could not be created.
    if new_set is None:
        print("Failed to create item set; aborting.")
        return
    set_id = new_set["o:id"]
    counter = 0
    with open("rrhydrated.jsonl", "r") as jl_file:
        for json_str in jl_file:
            try:
                create_tweet_omeka(json.loads(json_str), set_id,
                                   url_api, key_identity, key_credential)
            except Exception as err:
                # Best-effort ingest: skip failed tweets, but report the
                # reason instead of a bare except that hides every bug.
                print(f"Skipping tweet: {err}")
                continue
            counter += 1
            print(counter)
            if counter > 300:
                break


if __name__ == "__main__":
    main()