This project was part of an undergraduate capstone project.
It addresses the need for sentiment analysis in the hotel and restaurant industry, where understanding customer opinions helps enhance service quality. It also considers the impact of sentiment on stock market performance and the need for effective analysis of news and reviews to guide investment decisions.
def extract_tweets(q):
    """Search recent tweets matching *q* and return them as a list of strings.

    Args:
        q: Query string passed to ``tweepy.Client.search_recent_tweets``.

    Returns:
        A list with ``str(tweet)`` for each returned tweet (at most 100, the
        ``max_results`` requested). An empty list when the search returns no
        data.
    """
    client = tweepy.Client(bearer_token=bearer_token)
    response = client.search_recent_tweets(query=q, max_results=100)
    # First field of the response is the tweet payload; it is None (not an
    # empty list) when nothing matched, which previously crashed on len().
    tweets = response[0]
    if not tweets:
        return []
    return [str(tweet) for tweet in tweets]
# Lazily-built FinBERT pipeline, shared by every call to model().
_FINBERT_PIPELINE = None


def model(txt):
    """Run the ``yiyanghkust/finbert-tone`` sentiment pipeline on *txt*.

    The model, tokenizer and pipeline are built on the first call and reused
    afterwards, instead of being reloaded from disk/hub on every call.

    Args:
        txt: Text (or list of texts) handed to the sentiment-analysis pipeline.

    Returns:
        Whatever the Hugging Face pipeline returns for *txt*; the result is
        also printed.
    """
    global _FINBERT_PIPELINE
    if _FINBERT_PIPELINE is None:
        finbert = BertForSequenceClassification.from_pretrained(
            'yiyanghkust/finbert-tone', num_labels=3)
        tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        _FINBERT_PIPELINE = pipeline(
            "sentiment-analysis", model=finbert, tokenizer=tokenizer)
    result = _FINBERT_PIPELINE(txt)
    print(result)
    return result
def clean(twt):
    """Normalise every string in *twt* in place and return the same list.

    Each entry is lower-cased, passed through ``str.translate`` with the
    module-level ``translate_table``, and stripped of runs of ASCII digits.

    Args:
        twt: Mutable sequence of strings; it is modified in place.

    Returns:
        The same *twt* object, after cleaning.
    """
    digit_run = re.compile('[0-9]+')
    for idx, raw in enumerate(twt):
        lowered = raw.lower()
        translated = lowered.translate(translate_table)
        twt[idx] = digit_run.sub('', translated)
    return twt
# Checkpoint prefix used by both save_weights() and load_weights().
_REVIEW_WEIGHTS_PREFIX = 'savedmodel'
# Class index -> human-readable label (index 0 is Negative, 1 is Positive).
_REVIEW_LABELS = ['Negative', 'Positive']


def _train_review_classifier(tokenizer):
    """Fine-tune bert-base-uncased on static/ratings40.csv and save its weights."""
    df = pd.read_csv("static/ratings40.csv", encoding='latin-1')
    df['txt'] = df.Reviews
    # assumes Ratings already holds 0/1 class ids matching _REVIEW_LABELS
    # -- TODO confirm against the CSV.
    df['target'] = df.Ratings
    df = df.drop(['Ratings', 'Reviews'], axis=1)
    df = df.dropna()

    x_train, x_test, y_train, y_test = train_test_split(
        df['txt'], df['target'], test_size=0.2)
    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)

    train_examples, validation_examples = convert_data_to_examples(
        train, test, 'txt', 'target')
    train_data = convert_examples_to_tf_dataset(list(train_examples), tokenizer)
    train_data = train_data.shuffle(100).batch(32).repeat(2)
    validation_data = convert_examples_to_tf_dataset(
        list(validation_examples), tokenizer)
    validation_data = validation_data.batch(32)

    classifier = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
    classifier.fit(train_data, epochs=2, validation_data=validation_data)
    # The previous os.listdir('savedmodel') call was dropped: its result was
    # unused and it raised FileNotFoundError before the weights were saved
    # whenever no such directory existed.
    classifier.save_weights(_REVIEW_WEIGHTS_PREFIX)
    return classifier


def _predict_review_label(classifier, tokenizer, text):
    """Return 'Negative' or 'Positive' for a single piece of text."""
    batch = tokenizer([text], max_length=128, padding=True, truncation=True,
                      return_tensors='tf')
    outputs = classifier(batch)
    probabilities = tf.nn.softmax(outputs[0], axis=-1)
    class_index = tf.argmax(probabilities, axis=1).numpy()[0]
    return _REVIEW_LABELS[class_index]


def model1(text):
    """Classify *text* as 'Negative' or 'Positive' with a fine-tuned BERT model.

    If no saved checkpoint is found, the model is first fine-tuned on
    ``static/ratings40.csv`` and its weights are saved under the
    ``savedmodel`` prefix; otherwise the saved weights are loaded.

    The checkpoint test now looks for ``savedmodel.index`` (the index file a
    TensorFlow-format ``save_weights('savedmodel')`` is expected to write)
    instead of ``model_weights.index``, which nothing in this function ever
    created -- so the model was retrained on every call.

    Args:
        text: The review/sentence to classify.

    Returns:
        A dict mapping both ``"res"`` and *text* itself to the predicted
        label. Previously the training path returned ``{text: label}`` and
        the cached path returned ``{"res": label}``; both keys are provided
        so either lookup keeps working regardless of which path ran.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    if os.path.isfile(_REVIEW_WEIGHTS_PREFIX + '.index'):
        classifier = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
        classifier.load_weights(_REVIEW_WEIGHTS_PREFIX)
    else:
        classifier = _train_review_classifier(tokenizer)

    label = _predict_review_label(classifier, tokenizer, text)
    print(text, ": \\n", label)
    return {"res": label, text: label}
Tools: Django was used for the back-end and HTML/CSS for the front-end. A BERT model from HuggingFace was fine-tuned to analyze the sentiment of tweets and reviews.