% Journal article record; the citation key is the article's DOI.
% NOTE(review): the second author is entered as a single-part name (presumably a mononym); confirm against the published paper.
@article{10.3844/jcssp.2024.819.826,
  author       = {Ibrahim, Muhammad Amien and {Faisal} and Sulistiya, Zefanya Delvin and Winarto, Tora Sangputra Yopie},
  title        = {Prompt-Based Data Augmentation with Large Language Models for {Indonesian} Gender-Based Hate Speech Detection},
  journal      = {Journal of Computer Science},
  publisher    = {Science Publications},
  volume       = {20},
  number       = {8},
  pages        = {819--826},
  year         = {2024},
  month        = may,
  doi          = {10.3844/jcssp.2024.819.826},
  url          = {https://thescipub.com/abstract/jcssp.2024.819.826},
  article_type = {journal},
  abstract     = {The increasing amount of content on social media content makes the use of automatic moderation crucial for preserving a healthy online community and reducing the spread of offensive and abusive content, such as hate speech based on gender. Developing automated social media moderation using machine learning demands a large and balanced dataset. However, difficulties such as data scarcity and class imbalance have hindered the development of gender-based hate speech detection on Indonesian Twitter communities. Creating and annotating a new dataset would be time-consuming and costly. One practical alternative is to use data augmentation methods to help address the minority class imbalance in datasets. This study investigates how prompt-based data augmentation may be used with a large language model to provide organic tweet samples for gender-based hate speech detection. Furthermore, the study investigates the preservation of labels in augmented Twitter samples. In comparison to the benchmark back translation approach, the results show that prompt-based data augmentation using a large language model may generate new and organic Twitter samples while keeping labels preserved and avoiding memorization. In conventional machine learning models, prompt-based data augmentation with a large language model shows competitive performance compared to back translation in terms of accuracy metrics. According to these results, using prompting for data augmentation on large language models is an alternative strategy that can provide new, less memorization tweet samples that maintain label integrity while achieving competitive accuracy results.},
}