Fine-tuning BERT for Sentiment Analysis Applications

This project was part of an undergraduate capstone project.

Situation

Hotels and restaurants need to analyze the sentiment of customer reviews to understand opinions and improve service quality. Similarly, sentiment expressed in news and social media influences stock market performance, so investors need effective analysis of financial news and tweets to guide their decisions.

Task

Build a web application that classifies the sentiment of financial tweets with a pretrained FinBERT model, and the sentiment of hotel/restaurant reviews with a BERT model fine-tuned on a ratings dataset.

Action

import os
import re
import string

import pandas as pd
import tensorflow as tf
import tweepy
from sklearn.model_selection import train_test_split
from transformers import (BertForSequenceClassification, BertTokenizer,
                          InputExample, TFBertForSequenceClassification, pipeline)

# Translation table used by clean() to strip punctuation.
translate_table = str.maketrans('', '', string.punctuation)

def extract_tweets(q):
    # Fetch up to 100 recent tweets matching the query; bearer_token is
    # assumed to be defined elsewhere (e.g., in the app's settings).
    client = tweepy.Client(bearer_token=bearer_token)
    twt = client.search_recent_tweets(query=q, max_results=100)
    # twt is a Response namedtuple; twt.data holds the list of Tweet objects.
    return [str(tweet) for tweet in (twt.data or [])]
def model(txt):
    # Classify financial sentiment with the pretrained FinBERT model.
    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
    result = nlp(txt)  # list of {'label': ..., 'score': ...} dicts
    return result
def clean(twt):
    # Lower-case each tweet, strip punctuation, and remove digits.
    for i in range(len(twt)):
        twt[i] = twt[i].lower()
        twt[i] = twt[i].translate(translate_table)
        twt[i] = re.sub('[0-9]+', '', twt[i])
    return twt
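
# A quick end-to-end check of the pieces above; the query string is
# hypothetical, and bearer_token must be set for the API call to succeed.
if __name__ == '__main__':
    tweets = clean(extract_tweets('"stock market" lang:en -is:retweet'))
    print(model(tweets))  # e.g. [{'label': 'Positive', 'score': 0.93}, ...]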
def model1(text):
    # Classify review sentiment with BERT fine-tuned on a ratings dataset.
    # Fine-tunes once and caches the weights; later calls load the checkpoint.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    if not os.path.isfile('savedmodel.index'):
        # First run: fine-tune on the review dataset (ratings assumed binary 0/1).
        df = pd.read_csv("static/ratings40.csv", encoding='latin-1')
        df['txt'] = df.Reviews
        df['target'] = df.Ratings
        df = df.drop(['Ratings', 'Reviews'], axis=1).dropna()
        x_train, x_test, y_train, y_test = train_test_split(df['txt'], df['target'], test_size=0.2)
        train = pd.concat([x_train, y_train], axis=1)
        test = pd.concat([x_test, y_test], axis=1)
        DATA_COLUMN = 'txt'
        LABEL_COLUMN = 'target'
        train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)
        train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
        train_data = train_data.shuffle(100).batch(32).repeat(2)
        validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
        validation_data = validation_data.batch(32)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
        model.fit(train_data, epochs=2, validation_data=validation_data)
        model.save_weights('savedmodel')  # writes savedmodel.index and data shards
    else:
        # Subsequent runs: reuse the cached fine-tuned weights.
        model.load_weights('savedmodel')
    pred_sentences = [text]
    tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    labels = ['Negative', 'Positive']
    label = tf.argmax(tf_predictions, axis=1).numpy()
    res = {}
    for i in range(len(pred_sentences)):
        res[pred_sentences[i]] = labels[label[i]]
    return res
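
model1 depends on two helpers, convert_data_to_examples and convert_examples_to_tf_dataset, which are not shown above. The sketch below follows the common pattern for wrapping pandas rows in transformers InputExample objects and tokenizing them into a tf.data.Dataset; the exact signatures and details are assumptions, not the project's original code.

def convert_data_to_examples(train, test, data_column, label_column):
    # Wrap each DataFrame row in an InputExample (single-sentence task, so text_b=None).
    train_InputExamples = train.apply(
        lambda x: InputExample(guid=None, text_a=x[data_column], text_b=None, label=x[label_column]),
        axis=1)
    validation_InputExamples = test.apply(
        lambda x: InputExample(guid=None, text_a=x[data_column], text_b=None, label=x[label_column]),
        axis=1)
    return train_InputExamples, validation_InputExamples

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    # Tokenize each example and emit (features, label) pairs as a tf.data.Dataset.
    features = []
    for e in examples:
        enc = tokenizer.encode_plus(e.text_a, add_special_tokens=True, max_length=max_length,
                                    padding='max_length', truncation=True,
                                    return_token_type_ids=True, return_attention_mask=True)
        features.append((enc['input_ids'], enc['attention_mask'],
                         enc['token_type_ids'], int(e.label)))

    def gen():
        for input_ids, attention_mask, token_type_ids, label in features:
            yield ({'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'token_type_ids': token_type_ids}, label)

    return tf.data.Dataset.from_generator(
        gen,
        ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32}, tf.int64),
        ({'input_ids': tf.TensorShape([max_length]),
          'attention_mask': tf.TensorShape([max_length]),
          'token_type_ids': tf.TensorShape([max_length])}, tf.TensorShape([])))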

Tools: Django was used for the back end and HTML/CSS for the front end. A BERT model from Hugging Face was fine-tuned to analyze the sentiment of tweets and reviews.
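
To show how the model plugs into the Django back end, here is a minimal sketch of a view wiring a POST form to model1; the view name, template, and form field are hypothetical, not the project's actual code.

from django.shortcuts import render

def analyze_review(request):
    # Hypothetical view: 'analyze.html' and the 'review' field are illustrative
    # names; model1 is assumed importable from the app's model module.
    result = None
    if request.method == 'POST':
        text = request.POST.get('review', '')
        if text:
            result = model1(text)  # e.g. {'great food and service': 'Positive'}
    return render(request, 'analyze.html', {'result': result})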

Result