import torch
import math

# Fit a cubic polynomial y = a + b*x + c*x^2 + d*x^3 to sin(x) on [-π, π]
# using plain tensors and hand-written gradient descent (no autograd).

# Tensor dtype and compute device shared by every tensor created below.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment to use GPU if available

# 1. Generate input data (x) in [-π, π] and the corresponding targets (y = sin(x))
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)  # The function the polynomial should approximate

# The powers of x never change during training, so compute them once here
# instead of re-evaluating them several times on every iteration.
x_squared = x ** 2
x_cubed = x ** 3

# 2. Randomly initialize the weights (polynomial coefficients) as 0-dim tensors
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # 3. Forward pass: evaluate the cubic polynomial with the current coefficients
    #     y_pred = a + b*x + c*x^2 + d*x^3
    y_pred = a + b * x + c * x_squared + d * x_cubed
    residual = y_pred - y  # Shared by the loss and by every gradient below

    # 4. Report the loss: the sum of squared errors (i.e. the squared error
    #    summed over all samples rather than averaged). The value is only
    #    needed for printing, so the reduction and the tensor-to-float
    #    conversion via .item() are done on reporting iterations only.
    if t % 100 == 99:
        loss = residual.pow(2).sum().item()
        print(t, loss)

    # 5. Manually compute the gradient of the loss w.r.t. each weight
    #    (chain rule through y_pred).
    grad_y_pred = 2.0 * residual                     # d(loss)/d(y_pred)
    grad_a = grad_y_pred.sum()                       # d(y_pred)/da = 1
    grad_b = (grad_y_pred * x).sum()                 # d(y_pred)/db = x
    grad_c = (grad_y_pred * x_squared).sum()         # d(y_pred)/dc = x^2
    grad_d = (grad_y_pred * x_cubed).sum()           # d(y_pred)/dd = x^3

    # 6. Gradient descent step: move each weight a small distance against its gradient
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')