# NOTE: the original capture of this file began with stray hosting-page UI
# text ("Spaces:", "Sleeping", "Sleeping"); it is not part of the script.
import logging | |
import yaml | |
import torch | |
import time | |
from torch import nn, optim | |
from torch.utils.data import DataLoader | |
from tqdm import tqdm | |
import os | |
import pandas as pd | |
from .classifier import Classifier | |
from .bert import Bert | |
from .dataset import TextDataset | |
def setup_logging(config):
    """Configure root logging to a file and return this module's logger.

    Args:
        config: dict whose config['logging'] section provides 'log_dir',
            'level' and 'format' keys (shape inferred from usage below).

    Returns:
        logging.Logger named after this module.
    """
    log_dir = config['logging']['log_dir']
    # basicConfig raises FileNotFoundError if the directory is missing,
    # so create it up front.
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=os.path.join(log_dir, "log.log"),
        filemode='w',
        level=config['logging']['level'],
        format=config['logging']['format']
    )
    return logging.getLogger(__name__)
def evaluate(model, dataloader, criterion, device=None):
    """Run one evaluation pass and return (mean loss, accuracy).

    Args:
        model: module mapping a batch of texts to probabilities in [0, 1]
            (the BCE criterion and 0.5 threshold below assume sigmoid output).
        dataloader: yields (texts, labels) batches; labels are binary.
        criterion: loss called as criterion(outputs, labels), e.g. nn.BCELoss.
        device: torch.device for the labels. When None it is inferred from
            the model's parameters (CPU fallback). The original read a
            module-level global that only exists when run as a script,
            raising NameError for any other caller.

    Returns:
        Tuple (average loss per sample, fraction of correct predictions);
        (0.0, 0.0) for an empty dataloader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in dataloader:
            labels = labels.float().to(device)
            # squeeze(-1), not squeeze(): a batch of size 1 must stay 1-D so
            # its shape matches the labels inside the criterion.
            outputs = model(texts).squeeze(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * labels.size(0)
            # Threshold the probabilities at 0.5 for binary predictions.
            correct += ((outputs >= 0.5).float() == labels).sum().item()
            total += labels.size(0)
    if total == 0:
        return 0.0, 0.0
    return total_loss / total, correct / total
if __name__ == "__main__":
    # Load the experiment configuration; the context manager closes the
    # handle promptly (the original leaked it via safe_load(open(...))).
    with open("config.yaml") as config_file:
        config = yaml.safe_load(config_file)
    # Make sure output directories exist before anything writes to them.
    os.makedirs(config['logging']['log_dir'], exist_ok=True)
    os.makedirs(config['training']['save_dir'], exist_ok=True)
    logger = setup_logging(config)
    logger.info("Starting training process")
    logger.info(f"Configuration: {config}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Model initialization
    model = Classifier(Bert(config['model']['bert_name'])).to(device)
    logger.info("Model initialized")
    # Data loading.
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # dataset files from trusted paths.
    train_dataset = torch.load(config['data']['train_path'])
    test_dataset = torch.load(config['data']['test_path'])
    train_loader = DataLoader(
        train_dataset,
        batch_size=int(config['data']['batch_size']),
        shuffle=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=int(config['data']['batch_size']),
        shuffle=False
    )
    logger.info(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    # Optimizer and binary cross-entropy loss (model is assumed to end in a
    # sigmoid — TODO confirm against Classifier).
    optimizer = optim.Adam(
        model.parameters(),
        lr=float(config['training']['learning_rate'])
    )
    criterion = nn.BCELoss()
    # Per-epoch metrics, later dumped to CSV.
    results = []
    for epoch in range(int(config['training']['epochs'])):
        start_time = time.time()
        # Training pass
        model.train()
        train_loss, train_correct = 0.0, 0
        for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}", ncols=100):
            labels = labels.float().to(device)
            optimizer.zero_grad()
            # squeeze(-1), not squeeze(): keeps a batch of size 1 one-dimensional
            # so the output shape matches the labels inside BCELoss.
            outputs = model(texts).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Accumulate the per-sample loss so the mean is batch-size independent.
            train_loss += loss.item() * labels.size(0)
            train_correct += ((outputs >= 0.5).float() == labels).sum().item()
        # Evaluation
        train_loss /= len(train_dataset)
        train_acc = train_correct / len(train_dataset)
        test_loss, test_acc = evaluate(model, test_loader, criterion)
        # Record the epoch metrics
        results.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "test_loss": test_loss,
            "train_acc": train_acc,
            "test_acc": test_acc
        })
        # Logging
        epoch_time = time.time() - start_time
        logger.info(f"Epoch {epoch+1} [{epoch_time:.1f}s]")
        logger.info(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")
        logger.info(f"Test Loss: {test_loss:.4f} | Acc: {test_acc:.4f}")
        # Checkpoint after every epoch.
        torch.save(model.state_dict(),
                   os.path.join(config['training']['save_dir'], f"model_{epoch+1}.pth"))
    # Persist the metrics as CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(config['logging']['log_dir'], "training_results.csv"), index=False)
    # Training finished
    logger.info("Training completed")