{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demo: keyword-based sentiment classifier\n",
    "[UMass CS 490A](https://people.cs.umass.edu/~brenocon/cs490a_f20/), 9/10/2020.\n",
    "Attempt to replicate [Pang et al. 2002](https://www.aclweb.org/anthology/W02-1011/)'s \"Human 1, Human 2\" experiment.\n",
    "\n",
    "Data is the \"1.0\" version from here, which I think was used in the original paper: http://www.cs.cornell.edu/people/pabo/movie-review-data/\n",
    "\n",
    "Spreadsheet of keyword submissions:\n",
    "https://docs.google.com/spreadsheets/d/1sdYr-iHEe6ZQADWCK0XbRTIq5OlpIoRhXqTPnHnZSNQ/edit?usp=sharing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def load_file(filename):\n",
    "    \"\"\"Read a text file and return its whitespace-separated tokens as a list.\n",
    "\n",
    "    The file is decoded with the platform's default encoding; bytes that\n",
    "    cannot be decoded are silently dropped (errors='ignore').\n",
    "    \"\"\"\n",
    "    # The with-block closes the file handle as soon as it has been read.\n",
    "    with open(filename, errors='ignore') as f:\n",
    "        return f.read().split()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data\n",
    "\n",
    "`docs` and `true_labels` are parallel lists: `docs[i]` is the token list of one review and `true_labels[i]` is its label (`'neg'` or `'pos'`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything is just the test set, since we have no machine learning\n",
    "true_labels = []\n",
    "docs = []\n",
    "# Sort the paths: glob.glob returns them in a filesystem-dependent order, and\n",
    "# a fixed order keeps positional lookups such as docs[100] reproducible.\n",
    "for label in (\"neg\", \"pos\"):\n",
    "    for f in sorted(glob.glob(f\"tokens/{label}/*.txt\")):\n",
    "        docs.append(load_file(f))\n",
    "        true_labels.append(label)\n",
    "\n",
    "# Fail early with a clear message instead of a ZeroDivisionError further down.\n",
    "assert docs, \"no review files found under tokens/neg or tokens/pos\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keyword lists\n",
    "\n",
    "Each set is built by splitting a whitespace-separated word list. Matching in `kw_classify` is exact and case-sensitive, so a capitalized entry such as `Amazing` only matches a token with that exact capitalization. Some words (e.g. `accurate`, `good`) appear in both sets; each occurrence then adds one to both counts and does not change the predicted label."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The keyword-based classifier\n",
    "POS_KEYWORDS = set(\"\"\"\n",
    "Amazing\n",
    "Awesome\n",
    "Best\n",
    "Cool\n",
    "Dazzling\n",
    "Fantastic\n",
    "Good\n",
    "Great\n",
    "Inspirational\n",
    "Intense\n",
    "Phenomenal\n",
    "Superb\n",
    "Thunderstruck\n",
    "Unbelievable\n",
    "Worth\n",
    "Wow\n",
    "absorbing\n",
    "accurate\n",
    "amazing\n",
    "art\n",
    "artist\n",
    "awesome\n",
    "beautiful\n",
    "best\n",
    "blowing\n",
    "breaking\n",
    "brilliant\n",
    "captivating\n",
    "capturing\n",
    "changing\n",
    "chemistry\n",
    "choreographed\n",
    "cool\n",
    "creative\n",
    "deep\n",
    "depth\n",
    "emotional\n",
    "energetic\n",
    "engaging\n",
    "enjoyable\n",
    "enjoyed\n",
    "enlightening\n",
    "entertaining\n",
    "enticing\n",
    "epic\n",
    "excellent\n",
    "exciting\n",
    "extraordinary\n",
    "fabulous\n",
    "fantastic\n",
    "fascinating\n",
    "feel\n",
    "filmed\n",
    "fresh\n",
    "fun\n",
    "funny\n",
    "genius\n",
    "good\n",
    "great\n",
    "gripping\n",
    "ground\n",
    "groundbreaking\n",
    "happy\n",
    "harmonious\n",
    "healing\n",
    "heart\n",
    "hilarious\n",
    "immersed\n",
    "immersive\n",
    "impactful\n",
    "impressive\n",
    "insightful\n",
    "inspiring\n",
    "intense\n",
    "interesting\n",
    "intriguing\n",
    "justified\n",
    "laugh\n",
    "life\n",
    "likable\n",
    "like\n",
    "lol\n",
    "love\n",
    "loved\n",
    "lovely\n",
    "magnificent\n",
    "masterfully\n",
    "memorizing\n",
    "mind\n",
    "moving\n",
    "must\n",
    "nevertheless\n",
    "nice\n",
    "nominated\n",
    "nostalgia\n",
    "oscar\n",
    "outstanding\n",
    "perfect\n",
    "phenomenal\n",
    "picking\n",
    "pog\n",
    "poggers\n",
    "poignant\n",
    "popcorn\n",
    "powerful\n",
    "professional\n",
    "provoking\n",
    "refreshing\n",
    "revolutionary\n",
    "rewatch\n",
    "riveting\n",
    "satisfying\n",
    "saucy\n",
    "saved\n",
    "scintillating\n",
    "strong\n",
    "stunning\n",
    "super\n",
    "terrific\n",
    "thought\n",
    "thoughtful\n",
    "thriller\n",
    "thrilling\n",
    "touching\n",
    "transcendent\n",
    "unseen\n",
    "watch\n",
    "watchable\n",
    "well\n",
    "wonderful\n",
    "wow\n",
    "wrenching\"\"\".split())\n",
    "\n",
    "\n",
    "\n",
    "NEG_KEYWORDS = set(\"\"\"\n",
    "Awful\n",
    "Bad\n",
    "Confusing\n",
    "Crap\n",
    "Crappy\n",
    "Horrible\n",
    "Horrific\n",
    "It\n",
    "No\n",
    "Suck\n",
    "Terrible\n",
    "Trash\n",
    "Worst\n",
    "accurate\n",
    "amateur\n",
    "annoying\n",
    "anxious\n",
    "appalling\n",
    "asleep\n",
    "atrocious\n",
    "average\n",
    "awful\n",
    "bad\n",
    "basic\n",
    "big\n",
    "boring\n",
    "careless\n",
    "cliche\n",
    "cliched\n",
    "cliches\n",
    "contrived\n",
    "cookie\n",
    "cringe\n",
    "cringeworthy\n",
    "cutter\n",
    "damned\n",
    "desolate\n",
    "disappointed\n",
    "disappointing\n",
    "disgusting\n",
    "disorganized\n",
    "distasteful\n",
    "disturbing\n",
    "drab\n",
    "dragged\n",
    "dreadful\n",
    "dreary\n",
    "dull\n",
    "egregious\n",
    "enraged\n",
    "evil\n",
    "fake\n",
    "feeble\n",
    "forgettable\n",
    "garbage\n",
    "good\n",
    "gross\n",
    "hate\n",
    "hated\n",
    "headache\n",
    "horrendous\n",
    "horrible\n",
    "horrid\n",
    "however\n",
    "insulting\n",
    "lackluster\n",
    "lame\n",
    "lazy\n",
    "lifeless\n",
    "long\n",
    "lousy\n",
    "mediocre\n",
    "negative\n",
    "no\n",
    "nonsensical\n",
    "not\n",
    "nothing\n",
    "offensive\n",
    "old\n",
    "passe\n",
    "pathetic\n",
    "plain\n",
    "plot\n",
    "poor\n",
    "poorly\n",
    "predictable\n",
    "regretful\n",
    "rough\n",
    "sad\n",
    "sadly\n",
    "sh\n",
    "shallow\n",
    "shameful\n",
    "shit\n",
    "short\n",
    "simple\n",
    "skip\n",
    "sleepy\n",
    "slow\n",
    "spooky\n",
    "stupid\n",
    "suck\n",
    "tasteless\n",
    "terrible\n",
    "time\n",
    "timepass\n",
    "trash\n",
    "unattractive\n",
    "unbearable\n",
    "underwritten\n",
    "unfortunately\n",
    "unfunny\n",
    "uninspired\n",
    "uninteresting\n",
    "unoriginal\n",
    "unpalatable\n",
    "unrealistic\n",
    "unstructured\n",
    "untolerable\n",
    "unwatchable\n",
    "vapid\n",
    "waste\n",
    "weak\n",
    "worst\n",
    "worthy\n",
    "\"\"\".split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1400"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1400"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(true_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classifier\n",
    "\n",
    "Summing booleans counts the `True` values, which is how `kw_classify` counts keyword hits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum([True,True,False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "def kw_classify(doc):\n",
    "    \"\"\"Classify a tokenized document as 'pos' or 'neg' by counting keywords.\n",
    "\n",
    "    Every token occurrence found in POS_KEYWORDS or NEG_KEYWORDS counts once\n",
    "    (matching is exact and case-sensitive). Returns \"pos\" only when positive\n",
    "    hits strictly outnumber negative hits; ties, including a document with\n",
    "    no hits at all, come out as \"neg\".\n",
    "    \"\"\"\n",
    "    num_pos = sum(w in POS_KEYWORDS for w in doc)\n",
    "    num_neg = sum(w in NEG_KEYWORDS for w in doc)\n",
    "    return \"pos\" if num_pos > num_neg else \"neg\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check a single document\n",
    "kw_classify(docs[100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make predictions (classifications) into parallel list\n",
    "preds = [kw_classify(doc) for doc in docs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'pos': 594, 'neg': 806})"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(preds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the positions where the prediction agrees with the true label\n",
    "num_correct = sum(pred == truth for pred, truth in zip(preds, true_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "954"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_correct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6814285714285714"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## ACCURACY RATE -- # Evaluate predictions vs. ground truth\n",
    "num_correct / len(preds)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}