| |
| !pip install transformers datasets |
|
|
| |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments |
| from datasets import load_dataset |
| import torch |
| import numpy as np |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support |
|
|
| |
# Load only the tokenizer here. The classification model is instantiated
# further down, once the number of target classes has been derived from the
# training labels; loading it here as well would read the checkpoint twice
# and build a model object that is thrown away before it is ever used.
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
|
|
| |
# Fetch the symptom-text dataset and keep a handle on each of its two splits.
ds = load_dataset("celikmus/symptom_text_to_disease_01")
train_dataset, test_dataset = ds['train'], ds['test']
|
|
| |
# Readable symptom name for each integer label id. A name's position in the
# tuple is its id, so the resulting dict maps 0..24 to the names in order.
symptom_mapping = dict(enumerate((
    "emotional pain",      # 0
    "hair falling out",    # 1
    "heart hurts",         # 2
    "infected wound",      # 3
    "foot ache",           # 4
    "shoulder pain",       # 5
    "injury from sports",  # 6
    "skin issue",          # 7
    "stomach ache",        # 8
    "knee pain",           # 9
    "joint pain",          # 10
    "hard to breath",      # 11
    "head ache",           # 12
    "body feels weak",     # 13
    "feeling dizzy",       # 14
    "back pain",           # 15
    "open wound",          # 16
    "internal pain",       # 17
    "blurry vision",       # 18
    "acne",                # 19
    "muscle pain",         # 20
    "neck pain",           # 21
    "cough",               # 22
    "ear ache",            # 23
    "feeling cold",        # 24
)))
|
|
| |
def _add_symptom_name(example):
    """Return the extra ``symptom_name`` column value for one example.

    The name is looked up in ``symptom_mapping`` using the example's integer
    ``labels`` value.
    """
    return {'symptom_name': symptom_mapping[example['labels']]}


# Iterating over a Dataset hands out a fresh dict per row, so assigning to a
# key inside a plain `for` loop never reaches the dataset itself. `map` is what
# actually materialises the new column, which the print below then reads.
train_dataset = train_dataset.map(_add_symptom_name)

# Sanity check: show the first training text next to its readable symptom name.
print(f"Teks: {train_dataset[0]['text']}, Nama Gejala: {train_dataset[0]['symptom_name']}")
|
|
| |
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``text`` entry, as passed in by
            ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns when asked for
        truncation and ``max_length`` padding with a limit of 256.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 256,
    }
    return tokenizer(examples['text'], **tokenizer_options)
|
|
| |
# Run the tokenizer over both splits in batched mode, train split first.
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Show which columns the training split carries after tokenization.
print(train_dataset.column_names)
|
|
| |
# Size the classification head from the label ids seen in training. The head
# needs one output per id up to the largest one, so use "max id + 1" rather
# than the count of distinct ids: the two agree when ids are contiguous from 0,
# but the count comes up short if any id is absent from the training split.
labels = train_dataset['labels']
unique_labels = set(labels)
num_labels = max(unique_labels) + 1

# Load the checkpoint with a sequence-classification head of that size.
model = AutoModelForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT",
    num_labels=num_labels,
)
|
|
| |
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old spelling is no longer accepted there. Because the
# notebook installs an unpinned transformers, pick whichever keyword the
# installed TrainingArguments dataclass actually declares.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

# Hyperparameters for fine-tuning; evaluation runs at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: 'epoch'},
)
|
|
| |
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        p: An object that unpacks into ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` over axis 1, so they are
            expected to hold one score per class for each example.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    # With many classes and a small evaluation set, some classes are often
    # never predicted. Stating zero_division=0 keeps the score for those
    # classes at 0 without emitting an UndefinedMetricWarning on every pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}
|
|
| |
# Wire the model, hyperparameters, both dataset splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune on the training split.
trainer.train()

# Score the fine-tuned model on the evaluation split and show the metrics.
results = trainer.evaluate()
print(results)