In [2]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on", device)


Running on cuda


In [3]:
with open("gatunamn.txt", "r", encoding="UTF-8") as f:
    names = f.read().split("\n")[2:]

n_samples = len(names)
n_samples

495319

In [4]:
longest_name = max(names, key=len)
longest_name = len(longest_name)
longest_name

52

In [5]:
unique_chars = list(set("".join(names + ["#"])))
n_unique_chars = len(unique_chars)
print(unique_chars[:10], n_unique_chars)

c2i = {c: i for i, c in enumerate(unique_chars)}
i2c = {i: c for i, c in enumerate(unique_chars)}

number_dataset = [np.array([c2i[c] for c in name]) for name in names]
print(names[0], number_dataset[0])


['D', 'H', '.', '3', 'è', 'F', '(', '7', 'Å', '-'] 93
Tångavägen [32 25 70 54 52 38 24 54 48 70]


In [6]:
class AutoEncoderGatunamn(nn.Module):
    def __init__(self, embed_size):
        super(AutoEncoderGatunamn, self).__init__()

        self._encoder = nn.Sequential(
            nn.Linear(embed_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
        )
        self._decoder = nn.Sequential(
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, embed_size),
        )
    
    def forward(self, x):
        x = self._encoder(x)
        x = self._decoder(x)
        return x
    
    def encode(self, x):
        return self._encoder(x)
    

In [7]:
pad_size = int(1.2 * longest_name)
padded_number_dataset = [F.pad(torch.Tensor(name), (0, pad_size - len(name)), "constant", c2i["#"]).view(-1, 1).long() for name in number_dataset]
print(names[0], number_dataset[0], padded_number_dataset[0].shape)


Tångavägen [32 25 70 54 52 38 24 54 48 70] torch.Size([62, 1])


In [8]:
from torch.utils.data import DataLoader, Dataset


class GatunamnDataset(Dataset):
    def __init__(self, data):
        self._data = data
        self._pad_symbol = c2i["#"]
    
    def __len__(self):
        return len(self._data)
    
    def __getitem__(self, idx):
        x = self._data[idx]
        return x
    

X = torch.cat(padded_number_dataset, dim=1).float().T
X = -1 + (X - X.min()) * 2 / (X.max() - X.min())
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)


torch.Size([396255, 62]) torch.Size([99064, 62])


In [9]:
model = AutoEncoderGatunamn(pad_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_loader = DataLoader(GatunamnDataset(X_train), batch_size=128, shuffle=True)
test_loader = DataLoader(GatunamnDataset(X_test), batch_size=32, shuffle=False)

epochs = 10

for epoch in range(epochs):
    model.train()
    running_train_loss = 0.0
    running_test_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()
        batch = batch.to(device)
        output = model(batch)
        train_loss = criterion(output.view(-1), batch.float().view(-1))
        running_train_loss += train_loss.item()
        train_loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            output = model(batch)
            test_loss = criterion(output.view(-1), batch.float().view(-1))
            running_test_loss += test_loss.item()

    running_train_loss /= len(train_loader)
    running_test_loss /= len(test_loader)

    print(f"Epoch {epoch + 1}/{epochs}, Train loss: {running_train_loss:.6f}, Test loss: {running_test_loss:.6f}")


Epoch 1/10, Train loss: 0.019771, Test loss: 0.014532
Epoch 2/10, Train loss: 0.013899, Test loss: 0.013421
Epoch 3/10, Train loss: 0.012999, Test loss: 0.012390
Epoch 4/10, Train loss: 0.011897, Test loss: 0.011301
Epoch 5/10, Train loss: 0.011103, Test loss: 0.010972
Epoch 6/10, Train loss: 0.010572, Test loss: 0.010272
Epoch 7/10, Train loss: 0.010146, Test loss: 0.009963
Epoch 8/10, Train loss: 0.009796, Test loss: 0.009562
Epoch 9/10, Train loss: 0.009520, Test loss: 0.009207
Epoch 10/10, Train loss: 0.009149, Test loss: 0.008572


In [10]:
X_max = max(c2i.values())
X_min = min(c2i.values())

print(X_max, X_min)

x = "Duettväg"
xx = np.array([c2i[c] for c in x])
xx = F.pad(torch.Tensor(xx), (0, pad_size - len(xx)), "constant", c2i["#"]).view(-1, 1).long()
x_test = -1 + (xx - X_min) * 2 / (X_max - X_min)
x_test_out = model.encode(x_test.T.to(device)).cpu().detach().numpy()

y = "Duettvägen"
yy = np.array([c2i[c] for c in y])
yy = F.pad(torch.Tensor(yy), (0, pad_size - len(yy)), "constant", c2i["#"]).view(-1, 1).long()
y_test = -1 + (yy - X_min) * 2 / (X_max - X_min)
y_test_out = model.encode(y_test.T.to(device)).cpu().detach().numpy()


92 0


In [11]:
x_test_out-y_test_out

array([[-0.00142717,  0.00262201, -0.00808334,  0.        ,  0.        ,
         0.        ,  0.        , -0.00378311]], dtype=float32)