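// p5.js sketch: draws one symbol per word of a TED-talk word embedding,
// shaped by part-of-speech category and tinted by the most similar TED tag.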
// ---------------------------------------------------------------------------
// Sketch-wide state. Everything here is shared between preload(), setup(),
// draw() and the POS class, so it is declared once at file scope.
// ---------------------------------------------------------------------------

// Explanatory texts; htmlElements() shows them in the description panel on hover.
let descriptionPOS = "POS (Part of Speech) Tags are used to determine the grammatical function of a word. These datapoints are sorted in 9 categories: noun, verb, adjective, adverb, conjunction or preposition, determiner or pre-determiner, personal pronoun, possessive pronoun and other. A word can be used as more than one of those categories but is sorted by the most frequent POS tag in my dataset.";
let descriptionTED = "Tags are assigned to a TED talk to show its topics. To categorize a word by TED Tags, its Word Embedding vector is compared to the vectors of 277 unique TED tags to search for the most similar.";
let descriptionData = "This visualization is part of a bachelor project by Anna-Lena Keith at FH Aachen 2023. The dataset consists of a 100 dimensional Word Embedding trained on TED Talk transcripts. Thanks to Nils Freyer for training the machine learning algorithm!";

// Paragraph elements of the description panel (created in htmlElements()).
let infoP;
let infoProject;
let infoPos;
let infoTed;

// Word data: `information` and `vectorsLoad` start as loaded tables and are
// replaced by their row arrays in dataLoad(); `vectors` receives one numeric
// array per word.
let information;
let vectorsLoad;
let vectors = [];
// Per-word occurrence counts, filled by dataLoad(). Declared here so the
// assignment there does not create an implicit global.
let frequency;
// Only referenced from commented-out code in this file.
let exampleSentences;

// POS tag rows and the parallel list of category names (filled in preload()).
let posTags;
let posCategory;

// One preloaded symbol image per grammatical category.
let img_noun;
let img_verb;
let img_adv;
let img_adj;
let img_con;
let img_det;
let img_personal;
let img_possessive;
let img_other;

// TED tag names and their vectors (same index), plus the raw table handles
// returned by loadTable().
let TEDtagLabels = [];
let TEDtagVectors = [];
let TEDtagLabelLoad;
let TEDtagVectorsLoad;

// Hue per TED tag index, filled in setup().
let colorArray = [];

// Elements of the hover panel (created in htmlElements()).
let extraImg;
let HLabel;
let PFrequency;
let PexampleSentence; // only referenced from commented-out code
let posInfoDiv;
let descriptionDiv;

// Grid layout: edge length of one cell in pixels, and the column count
// computed in setup().
let boxWidth = 20;
let columns;

// One POS instance per word, in dataset order.
let POSsymbols = [];

// Running grid cursor used by objectsAndMarks(): current row offset in
// pixels and position within the row.
let y = 0;
let count = 0;

// Frequency marks at the right canvas edge: y positions (filled by
// objectsAndMarks()) and the labels drawn next to them (same index).
let markPositions = [];
let markText = ["10000", "5000", "1000", "500", "200", "100", "60"];

// Feature switches. `sorted` gates loading of the TED tag data and the TED
// category assignment; `symbols` is not referenced by the visible code.
let symbols = true;
let sorted = true;
/**
 * p5.js preload hook: requests all tables and symbol images the sketch uses.
 *
 * Writes the globals `information`, `vectorsLoad`, `posTags`, `posCategory`,
 * `TEDtagLabelLoad`, `TEDtagVectorsLoad` and the nine `img_*` images, and
 * appends to `TEDtagLabels` / `TEDtagVectors` from the load callbacks.
 * The TED tag tables are only requested when `sorted` is truthy.
 */
function preload() {
  const dataUrl = 'https://raw.githubusercontent.com/alenakeith/embeddingData/main/';

  information = loadTable(dataUrl + 'information.tsv', 'tsv', 'header');
  vectorsLoad = loadTable(dataUrl + 'vectors.tsv', 'tsv');
  // The example-sentence table (exampleSentences.csv) is currently not loaded.

  // The callback replaces the table handle in `posTags` with its rows and
  // keeps the 'category' column as a parallel array.
  posTags = loadTable(dataUrl + 'posTags.tsv', 'tsv', 'header', (table) => {
    posCategory = table.getColumn('category');
    posTags = table.getRows();
  });

  if (sorted) {
    // Assign to the declared `TEDtagLabelLoad`; the previous spelling
    // `TEDtaglabelLoad` silently created a second, undeclared global.
    TEDtagLabelLoad = loadTable(dataUrl + 'TEDtagLabelsSorted.tsv', 'tsv', (table) => {
      for (const row of table.getRows()) {
        // First column of each row is the tag label.
        TEDtagLabels.push(row.arr[0]);
      }
    });
    TEDtagVectorsLoad = loadTable(dataUrl + 'TEDtagVectorsSorted.tsv', 'tsv', (table) => {
      for (const row of table.getRows()) {
        // Every column of a row is one vector component.
        TEDtagVectors.push(row.arr.map((num) => parseFloat(num)));
      }
    });
  }

  img_noun = loadImage('_pos_noun.svg');
  img_verb = loadImage('_pos_verb.svg');
  img_adv = loadImage('_pos_adverb.svg');
  img_adj = loadImage('_pos_adjective.svg');
  img_con = loadImage('_pos_conjunction.svg');
  img_det = loadImage('_pos_determiner.svg');
  img_personal = loadImage('_pos_personal pronoun.svg');
  img_possessive = loadImage('_pos_possessive pronoun.svg');
  img_other = loadImage('_pos_other.svg');
}
/**
 * Converts the preloaded tables into plain arrays.
 *
 * - `frequency` becomes the parsed values of the 'occurence' column.
 * - `information` and `vectorsLoad` are replaced by their row arrays.
 * - For every vector row the first column is removed and the remaining
 *   entries are parsed as floats in place; the row's own array is then
 *   appended to `vectors`.
 */
function dataLoad() {
  const rawCounts = information.getColumn('occurence');
  frequency = rawCounts.map((entry) => JSON.parse(entry));

  information = information.getRows();
  vectorsLoad = vectorsLoad.getRows();

  vectorsLoad.forEach((row) => {
    const components = row.arr;
    // Drop the leading column; only the entries after it are parsed as numbers.
    components.shift();
    components.forEach((text, position) => {
      components[position] = parseFloat(text);
    });
    vectors.push(components);
  });
  // Example sentences are currently not used.
}
/**
 * Builds the two fixed-position side panels next to the canvas.
 *
 * 1. The hover panel (`posInfoDiv`, class 'posInfo'): symbol image, word
 *    label and frequency paragraph, later updated by POS.onMouseHover().
 * 2. The description panel (`descriptionDiv`, class 'description'): three
 *    hoverable captions plus `infoP`, which shows the matching description
 *    text while a caption is hovered.
 *
 * Both panels are positioned at x = `width`.
 */
function htmlElements() {
  // --- hover panel ---------------------------------------------------------
  extraImg = createImg('_pos_other.svg');
  extraImg.style('width','60px');
  HLabel = createElement('h3', 'label');
  PFrequency = createP('information');

  posInfoDiv = createDiv('');
  posInfoDiv.class('posInfo');
  [extraImg, HLabel, PFrequency].forEach((child) => {
    child.parent(posInfoDiv);
  });
  posInfoDiv.position(width, 40, 'fixed');

  // --- description panel ---------------------------------------------------
  infoP = createP("");

  // Creates a caption paragraph; the description is looked up at hover time.
  const hoverCaption = (caption, readDescription) => {
    const paragraph = createP(caption);
    paragraph.mouseOver(() => {
      infoP.html(readDescription());
    });
    return paragraph;
  };
  infoProject = hoverCaption("about project and dataset", () => descriptionData);
  infoPos = hoverCaption("about POS tags", () => descriptionPOS);
  infoTed = hoverCaption("about TED categories", () => descriptionTED);

  descriptionDiv = createDiv("");
  descriptionDiv.position(width, 400, 'fixed');
  descriptionDiv.class('description');

  // Captions first, the text output last.
  for (const paragraph of [infoProject, infoPos, infoTed, infoP]) {
    paragraph.parent(descriptionDiv);
    paragraph.class('sentence');
  }
}
/**
 * p5.js setup hook: configures drawing modes, converts the loaded data,
 * sizes and creates the canvas, builds the HTML panels and the word grid,
 * and (when `sorted` is set) assigns one hue per TED tag.
 */
function setup() {
  frameRate(20);
  ellipseMode(CORNER);
  imageMode(CORNER);
  colorMode(HSB, 360, 100, 100, 100);

  dataLoad();

  // The canvas takes 79% of the window width; three cell widths are left
  // out of the column count.
  const canvasWidth = windowWidth * 0.79;
  columns = round(canvasWidth / boxWidth) - 3;
  // NOTE(review): 7507 is hard-coded; presumably the number of words in the
  // dataset — confirm against information.tsv.
  const rowCount = round(7507 / columns) + 1;
  canvas = createCanvas(canvasWidth, boxWidth * rowCount);

  htmlElements();
  objectsAndMarks();

  if (!sorted) {
    return;
  }
  // Spread the tag hues evenly over the 0–360 hue range.
  const hueStep = 360 / TEDtagLabels.length;
  for (let tag = 0; tag < TEDtagLabels.length; tag += 1) {
    colorArray[tag] = tag * hueStep;
  }
}
// Frame counter for draw(): symbols are only painted on frames where it is 0.
let it = 0;

/**
 * p5.js draw hook. Every frame it runs the hover check of each symbol and
 * redraws the frequency marks at the right canvas edge. The symbols
 * themselves are painted only when the frame counter is 0, i.e. on the first
 * frame and again each time the counter wraps after exceeding 1000. The
 * background is never cleared.
 */
function draw() {
  noStroke();

  const paintSymbols = it === 0;
  for (const symbol of POSsymbols) {
    symbol.onMouseHover();
    if (paintSymbols) {
      symbol.display();
    }
  }

  // Each mark is a short horizontal line with its label just above it.
  markPositions.forEach((markY, index) => {
    const markX = width - boxWidth * 1.8;
    stroke('#111952');
    line(markX, markY, width - 5, markY);
    noStroke();
    fill('#111952');
    textSize(9);
    text(markText[index], markX, markY - 4);
  });

  it += 1;
  if (it > 1000) {
    it = 0;
  }
}
/**
 * Creates one POS symbol per word, laid out row by row on the grid, and
 * records the y position of each frequency mark.
 *
 * Layout: the globals `count` (position in the row) and `y` (row offset in
 * pixels) act as a cursor; a new row starts once `count` exceeds `columns`.
 *
 * Marks: the thresholds are taken from the numeric labels in `markText`, so
 * `markPositions[k]` always belongs to `markText[k]`. Walking the words in
 * order, a mark is recorded at the first word whose frequency is at or below
 * the threshold. Thresholds are consumed strictly in list order, and a word
 * that crosses several thresholds at once records all of them (the previous
 * if / else-if chain recorded at most one of the lower marks per word).
 * The labels are expected to be numeric strings in descending order.
 */
function objectsAndMarks() {
  const thresholds = markText.map(Number);
  let nextMark = 0; // index of the next threshold that has not been placed yet

  for (let i = 0; i < information.length; i++) {
    POSsymbols[i] = new POS(count * boxWidth, y, i);

    // Advance the grid cursor; wrap to the next row after the last column.
    count++;
    if (count > columns) {
      count = 0;
      y += boxWidth;
    }

    // Place every pending mark this word has reached. The position uses the
    // cursor row after it has been advanced for this word.
    while (nextMark < thresholds.length && frequency[i] <= thresholds[nextMark]) {
      markPositions.push(y + boxWidth);
      nextMark++;
    }
  }
}
/**
 * One word of the dataset, drawn as a grid cell.
 *
 * Reads the globals `information`, `vectors`, `frequency`, `posTags`,
 * `posCategory`, the `img_*` images and — when `sorted` is set — the TED tag
 * vectors. All lookups happen once, in the constructor.
 */
class POS {
  /**
   * @param {number} x - left edge of the cell in canvas pixels
   * @param {number} y - top edge of the cell in canvas pixels
   * @param {number} index - row index of the word in `information`,
   *   `vectors` and `frequency`
   */
  constructor(x, y, index) {
    this.name = information[index].arr[1];
    this.Pos = information[index].arr[2];
    this.v = vectors[index];
    this.x = x;
    this.y = y;
    this.frequency = frequency[index];
    this.radius = boxWidth / 2;
    this.mouseHover = false;
    // The category must be resolved first: image and image source depend on it.
    this.grammar = this.checkGrammar();
    this.img = this.selectImg();
    this.imgSource = this.chooseImgSource();
    if (sorted) {
      this.TEDtagInd = this.TED();
    }
  }

  /** Draws the category symbol, tinted with the hue of the word's TED tag. */
  display() {
    tint(colorArray[this.TEDtagInd], 60, 90);
    image(this.img, this.x, this.y, boxWidth, boxWidth);
  }

  /** Fills the cell with a translucent rectangle in the TED tag hue. */
  displayTED() {
    fill(colorArray[this.TEDtagInd], 60, 100, 20);
    rect(this.x, this.y, boxWidth, boxWidth);
  }

  /**
   * Looks up the category of this word's POS tag.
   * Called once, when the object is initialized.
   * @returns {string|undefined} the entry of `posCategory` at the position of
   *   the first `posTags` row whose second column equals the tag, or
   *   undefined when no row matches
   */
  checkGrammar() {
    const row = posTags.findIndex((tag) => tag.arr[1] === this.Pos);
    return row === -1 ? undefined : posCategory[row];
  }

  /**
   * @returns {string} file name of the image shown in the hover panel for
   *   this word's category ('pos_other.png' for unknown categories)
   */
  chooseImgSource() {
    switch (this.grammar) {
      case "noun": return 'pos_noun.png';
      case "personal pronoun": return 'pos_personal pronoun.png';
      case "possessive pronoun": return 'pos_possessive pronoun.png';
      case "adjective": return 'pos_adjective.png';
      case "adverb": return 'pos_adverb.png';
      case "verb": return 'pos_verb.png';
      case "determiner or pre-determiner": return 'pos_determiner.png';
      case "conjunction or preposition": return 'pos_conjunction.png';
      default: return 'pos_other.png';
    }
  }

  /**
   * @returns {*} the preloaded image for this word's category (`img_other`
   *   for unknown categories). The globals are read at call time because
   *   they are only assigned in preload().
   */
  selectImg() {
    switch (this.grammar) {
      case "noun": return img_noun;
      case "personal pronoun": return img_personal;
      case "possessive pronoun": return img_possessive;
      case "adjective": return img_adj;
      case "adverb": return img_adv;
      case "verb": return img_verb;
      case "determiner or pre-determiner": return img_det;
      case "conjunction or preposition": return img_con;
      default: return img_other;
    }
  }

  /**
   * Updates `mouseHover`; while the mouse is within `radius` of the cell
   * centre, writes the word's details into the hover panel.
   */
  onMouseHover() {
    const distance = dist(this.x + this.radius, this.y + this.radius, mouseX, mouseY);
    this.mouseHover = distance < this.radius;
    if (!this.mouseHover) {
      return;
    }

    HLabel.html(this.name);
    HLabel.style('color', `hsl(${colorArray[this.TEDtagInd]}, 80%, 60%)`);

    // NOTE(review): the stray `</p>` tokens appear to be used as line
    // separators inside the paragraph — confirm the rendering before
    // replacing them.
    const details = sorted
      ? `Frequency in dataset: ${this.frequency},</p>most frequently used as: ${this.grammar},</p>calculated TED category: ${TEDtagLabels[this.TEDtagInd]}`
      : `Frequency: ${this.frequency}, most frequently used as: ${this.grammar}`;
    PFrequency.html(details);

    extraImg.attribute('src', this.imgSource);
  }

  /** Draws the raw POS tag next to the cell, plus a small square at its corner. */
  writePOS() {
    const labelX = this.x - textWidth(this.Pos) / 2;
    text(this.Pos, labelX, this.y);
    rect(this.x, this.y, 10);
  }

  /**
   * Finds the TED tag whose vector is most similar to this word's vector.
   *
   * The search starts from -Infinity, so the best tag is found even when
   * every similarity is zero or negative (starting from 0 always reported
   * index 0 in that case). NaN similarities never win a comparison and are
   * therefore skipped.
   * @returns {number} index into `TEDtagVectors` / `TEDtagLabels`; 0 when
   *   there are no tag vectors
   */
  TED() {
    let bestSimilarity = -Infinity;
    let bestIndex = 0;
    TEDtagVectors.forEach((tagVector, tagIndex) => {
      const similarity = calculateSimilarity(tagVector, this.v);
      if (similarity > bestSimilarity) {
        bestSimilarity = similarity;
        bestIndex = tagIndex;
      }
    });
    return bestIndex;
  }
}
/**
 * Cosine similarity of two vectors: dot(vA, vB) / (|vA| * |vB|).
 *
 * @param {number[]} vA - first vector
 * @param {number[]} vB - second vector
 * @param {number} [dimensions=100] - number of leading components to
 *   compare; defaults to 100, the value that was previously hard-coded
 * @returns {number} the similarity; NaN when either vector has zero length
 *   or has fewer than `dimensions` components
 */
function calculateSimilarity(vA, vB, dimensions = 100) {
  let dotProduct = 0;
  let squaredLengthA = 0;
  let squaredLengthB = 0;
  for (let i = 0; i < dimensions; i++) {
    dotProduct += vA[i] * vB[i];
    squaredLengthA += vA[i] * vA[i];
    squaredLengthB += vB[i] * vB[i];
  }
  // Math.sqrt keeps the function independent of the p5.js global `sqrt`.
  return dotProduct / (Math.sqrt(squaredLengthA) * Math.sqrt(squaredLengthB));
}