Binary Image CNN Autoencoder
A Binary-Image AutoEncoder with a PyTorch CNN, Explained
This article walks through sample code for an AutoEncoder that handles binary (black-and-white) images.
Complete Sample Code
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Use MPS (Apple Silicon) if available
if torch.backends.mps.is_available():
    device = torch.device("mps")

# Hyperparameters
BATCH_SIZE = 128
LEARNING_RATE = 0.001
EPOCHS = 3
LATENT_DIM = 32  # Dimensionality of the latent space

# Data preparation (using MNIST as an example)
transform = transforms.Compose([
    transforms.ToTensor(),
    # transforms.Normalize((0.5,), (0.5,))  # Normalize to the range -1 to 1
    transforms.Lambda(lambda x: (x > 0.5).float())  # Binarize
])
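# NOTE: (x > 0.5).float() thresholds every pixel to exactly 0.0 or 1.0,
# so the reconstruction targets are truly binary; this pairs naturally
# with the BCELoss used further below.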
train_dataset = datasets.MNIST(
    root='~/.pytorch/data',
    train=True,
    download=True,
    transform=transform
)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)
# Encoder definition
class Encoder(nn.Module):
    def __init__(self, latent_dim):
        super(Encoder, self).__init__()
        # Convolutional layers
        self.conv_layers = nn.Sequential(
            # Input: 1x28x28
            nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),   # 32x14x14
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),  # 64x7x7
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1), # 128x4x4
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )
        # Fully connected layer down to the latent space
        self.fc = nn.Linear(128 * 4 * 4, latent_dim)

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x
# Decoder definition
class Decoder(nn.Module):
    def __init__(self, latent_dim):
        super(Decoder, self).__init__()
        # From the latent space back to feature maps
        self.fc = nn.Linear(latent_dim, 128 * 4 * 4)
        # Transposed convolution (deconvolution) layers
        self.deconv_layers = nn.Sequential(
            # Input: 128x4x4
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),  # 64x8x8
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),   # 32x16x16
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=3),    # 1x28x28
            # nn.Tanh()  # Output in the range -1 to 1
            nn.Sigmoid()  # Constrain the output to the range 0-1
        )

    def forward(self, x):
        x = self.fc(x)
        x = x.view(x.size(0), 128, 4, 4)  # Reshape
        x = self.deconv_layers(x)
        return x
# AutoEncoder definition
class AutoEncoder(nn.Module):
    def __init__(self, latent_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

# Model initialization
model = AutoEncoder(LATENT_DIM).to(device)

# Loss function and optimizer
# criterion = nn.MSELoss()  # Mean squared error
# Loss function (binary cross-entropy)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Training loop
def train(model, train_loader, criterion, optimizer, epochs):
    model.train()
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (data, _) in enumerate(train_loader):
            data = data.to(device)
            # Reset gradients
            optimizer.zero_grad()
            # Forward pass
            reconstructed = model(data)
            # Compute the loss
            loss = criterion(reconstructed, data)
            # Backward pass
            loss.backward()
            # Update parameters
            optimizer.step()
            epoch_loss += loss.item()
            if batch_idx % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}')
        avg_loss = epoch_loss / len(train_loader)
        losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{epochs}], Average Loss: {avg_loss:.4f}')
    return losses

# Run training
losses = train(model, train_loader, criterion, optimizer, EPOCHS)
# Visualize the results
def visualize_results(model, test_loader, num_images=10):
    model.eval()
    with torch.no_grad():
        data, _ = next(iter(test_loader))
        data = data[:num_images].to(device)
        reconstructed = model(data)
    # Move to the CPU for display
    data = data.cpu()
    reconstructed = reconstructed.cpu()
    fig, axes = plt.subplots(2, num_images, figsize=(15, 3))
    for i in range(num_images):
        # Original image
        axes[0, i].imshow(data[i].squeeze(), cmap='gray')
        axes[0, i].axis('off')
        if i == 0:
            axes[0, i].set_title('Original', fontsize=10)
        # Reconstructed image
        axes[1, i].imshow(reconstructed[i].squeeze(), cmap='gray')
        axes[1, i].axis('off')
        if i == 0:
            axes[1, i].set_title('Reconstructed', fontsize=10)
    plt.tight_layout()
    plt.savefig('autoencoder_results.png')
    plt.show()
# Visualize on the test data
test_dataset = datasets.MNIST(
    root='~/.pytorch/data',
    train=False,
    download=True,
    transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
visualize_results(model, test_loader)

# Plot the training-loss curve
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.grid(True)
plt.savefig('autoencoder_loss.png')
plt.show()
```
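As a quick usage example, the trained encoder can also be used on its own to turn images into 32-dimensional feature vectors (the feature-extraction use case discussed below). This is a minimal sketch assuming the script above has already run, so `model` and `test_loader` are in scope:

```python
# Sketch: extract latent features with the trained encoder
model.eval()
with torch.no_grad():
    images, _ = next(iter(test_loader))
    latents = model.encoder(images.to(device))
print(latents.shape)  # torch.Size([128, 32]) for BATCH_SIZE=128, LATENT_DIM=32
```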
Explanation of the Key Parts
1. Encoder
```python
nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1)
```
- Convolutional layers extract features from the image
- stride=2 halves the spatial size at each layer (28 → 14 → 7 → 4)
- The channel count grows step by step (1 → 32 → 64 → 128) to enrich the features; a quick shape check is sketched below
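As a sanity check on the sizes in the comments above, a dummy batch can be pushed through the network. This is a minimal sketch assuming the `Encoder` class from the sample code is in scope:

```python
import torch

enc = Encoder(latent_dim=32)
dummy = torch.randn(2, 1, 28, 28)      # two MNIST-sized images
print(enc.conv_layers(dummy).shape)    # torch.Size([2, 128, 4, 4])
print(enc(dummy).shape)                # torch.Size([2, 32]) -- the latent vectors
```

Each Conv2d with stride=2 and padding=1 maps a size n to floor((n + 2·1 − 3)/2) + 1, which gives 28 → 14 → 7 → 4.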
2. Decoder
```python
nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1)
```
- Transposed convolutions (deconvolutions) upsample the feature maps
- This mirrors the Encoder's operations in reverse
- The final layer restores the original 1x28x28 image size; see the size check after this list
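The output size of a transposed convolution is (n − 1)·stride − 2·padding + kernel_size, which is why the last layer uses padding=3 to land exactly on 28: (16 − 1)·2 − 2·3 + 4 = 28. A minimal check, assuming the `Decoder` class above:

```python
import torch

dec = Decoder(latent_dim=32)
z = torch.randn(2, 32)   # a batch of two latent vectors
print(dec(z).shape)      # torch.Size([2, 1, 28, 28])
```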
3. Loss Function
```python
criterion = nn.BCELoss()
```
- Minimizes the difference between the original and reconstructed images
- Since the inputs are binarized to {0, 1} and the Decoder ends in Sigmoid, binary cross-entropy (BCELoss) is used here; MSELoss (mean squared error) also works and is a common choice for grayscale images
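Both losses accept the same tensors, so they are easy to compare directly. A small sketch; `nn.BCEWithLogitsLoss` is a numerically safer variant that expects raw logits, i.e. a model without the final Sigmoid (not what the sample code above does):

```python
import torch
import torch.nn as nn

target = (torch.rand(2, 1, 28, 28) > 0.5).float()  # binary targets in {0, 1}
pred = torch.rand(2, 1, 28, 28)                    # stand-in for Sigmoid outputs

print(nn.BCELoss()(pred, target).item())           # binary cross-entropy
print(nn.MSELoss()(pred, target).item())           # mean-squared-error alternative

# If the Decoder dropped its Sigmoid, use the logits version instead:
logits = torch.randn(2, 1, 28, 28)
print(nn.BCEWithLogitsLoss()(logits, target).item())
```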
Running this code, the model learns to compress and reconstruct images, and the same setup can be adapted for denoising and feature extraction (a denoising sketch follows below).
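For the denoising application, only the training step changes: corrupt the input but compute the loss against the clean image. A minimal sketch of the modified inner step, reusing `model`, `criterion`, `optimizer`, and `device` from the sample code; the pixel-flip noise and `flip_prob` are hypothetical choices, not part of the original code:

```python
def denoising_step(model, data, criterion, optimizer, flip_prob=0.1):
    """One training step of a denoising autoencoder (sketch)."""
    data = data.to(device)
    # Flip each binary pixel with probability flip_prob
    flip_mask = (torch.rand_like(data) < flip_prob).float()
    noisy = (data + flip_mask) % 2          # XOR-style flip for {0, 1} pixels
    optimizer.zero_grad()
    reconstructed = model(noisy)
    loss = criterion(reconstructed, data)   # target is the CLEAN image
    loss.backward()
    optimizer.step()
    return loss.item()
```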
4. Execution Results
```
% python3 binary-image-cnn-autoender.py
Epoch [1/3], Step [0/469], Loss: 0.5709
Epoch [1/3], Step [100/469], Loss: 0.1056
Epoch [1/3], Step [200/469], Loss: 0.0652
Epoch [1/3], Step [300/469], Loss: 0.0550
Epoch [1/3], Step [400/469], Loss: 0.0507
Epoch [1/3], Average Loss: 0.0886
Epoch [2/3], Step [0/469], Loss: 0.0445
Epoch [2/3], Step [100/469], Loss: 0.0387
Epoch [2/3], Step [200/469], Loss: 0.0380
Epoch [2/3], Step [300/469], Loss: 0.0394
Epoch [2/3], Step [400/469], Loss: 0.0358
Epoch [2/3], Average Loss: 0.0395
Epoch [3/3], Step [0/469], Loss: 0.0347
Epoch [3/3], Step [100/469], Loss: 0.0362
Epoch [3/3], Step [200/469], Loss: 0.0373
Epoch [3/3], Step [300/469], Loss: 0.0373
Epoch [3/3], Step [400/469], Loss: 0.0351
Epoch [3/3], Average Loss: 0.0344
```
5. Original and Reconstructed Images

6. Loss Curve
