HW2 - Single View to 3D

1. Exploring Loss Functions

1.1 Voxel Loss

import torch
import torch.nn as nn

def voxel_loss(voxel_src, voxel_tgt):
	# binary cross-entropy on raw logits: voxel_src are the unnormalized
	# decoder outputs, voxel_tgt is the binary occupancy grid of the same shape
	criterion = nn.BCEWithLogitsLoss()
	loss = criterion(voxel_src, voxel_tgt)
	return loss
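
Since BCEWithLogitsLoss applies the sigmoid internally, voxel_src should be the raw decoder logits and voxel_tgt a binary occupancy grid of the same shape. A quick sanity check with dummy tensors (shapes are illustrative):

voxel_src = torch.randn(2, 32, 32, 32)                  # raw logits from the decoder
voxel_tgt = (torch.rand(2, 32, 32, 32) > 0.5).float()   # binary ground-truth occupancy
print(voxel_loss(voxel_src, voxel_tgt))                 # scalar loss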

1.2 Point Cloud Loss

from pytorch3d.ops import knn_points

def chamfer_loss(point_cloud_src, point_cloud_tgt):
	# point_cloud_src, point_cloud_tgt: b x n_points x 3
	# for each point, find the squared distance to its nearest neighbor in the
	# other set; the loss is the sum of the two per-set means

	# cast to float for the distance computation
	A = point_cloud_src.float()
	B = point_cloud_tgt.float()

	# knn_points with K=1 returns the squared distance to the nearest neighbor;
	# the indices are not needed here
	dist_AB, _, _ = knn_points(A, B, K=1)
	dist_BA, _, _ = knn_points(B, A, K=1)

	loss_chamfer = dist_AB.mean() + dist_BA.mean()
	return loss_chamfer
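
As a sanity check, this should closely match PyTorch3D's built-in chamfer_distance for equal-sized clouds (it likewise averages squared nearest-neighbor distances and sums the two directions); a quick comparison with dummy data:

from pytorch3d.loss import chamfer_distance

A = torch.rand(2, 1000, 3)
B = torch.rand(2, 1000, 3)
ref, _ = chamfer_distance(A, B)                 # returns (point loss, normal loss)
print(chamfer_loss(A, B).item(), ref.item())    # expected to agree closely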

1.3 Mesh Loss (Smoothness)

from pytorch3d.loss import mesh_laplacian_smoothing

def smoothness_loss(mesh_src):
	# uniform Laplacian smoothing regularizer on the predicted mesh
	loss_laplacian = mesh_laplacian_smoothing(mesh_src, method="uniform")
	return loss_laplacian
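
For the mesh branch, this smoothness term is typically used as a regularizer alongside the Chamfer loss computed on points sampled from the predicted and ground-truth meshes. A minimal sketch of combining the two (w_smooth, n_sample, and the mesh_pred/mesh_gt names are assumptions, not part of the snippets above):

from pytorch3d.ops import sample_points_from_meshes

w_smooth = 0.5    # assumed smoothness weight
n_sample = 3000   # assumed number of sampled surface points

sample_pred = sample_points_from_meshes(mesh_pred, n_sample)  # b x n_sample x 3
sample_gt = sample_points_from_meshes(mesh_gt, n_sample)
loss = chamfer_loss(sample_pred, sample_gt) + w_smooth * smoothness_loss(mesh_pred)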

2. Reconstructing 3D from Single View

2.1 Image to Voxel Grid

📌 Halving the number of channels while doubling the spatial dimension at each layer is a common decoder pattern, so I follow that strategy here. A transposed convolution with kernel size 4, stride 2, and padding 1 doubles the spatial resolution exactly: D_out = (D_in - 1) * stride - 2 * padding + kernel_size = (D_in - 1) * 2 - 2 + 4 = 2 * D_in.

# define decoder
if args.type == "vox":
    # Input: b x 512
    # Output: b x 32 x 32 x 32
    self.decoder = nn.Sequential(
        nn.Linear(512, 64 * 2 * 2 * 2),  # 512 -> 512 (rearrange features)
        nn.ReLU(),
        nn.Unflatten(1, (64, 2, 2, 2)),  # (B,512) -> (B,64,2,2,2)
        nn.ConvTranspose3d(in_channels=64, out_channels=32, kernel_size=4, stride=2, padding=1),  # (B,64,2,2,2) -> (B,32,4,4,4)
        nn.ReLU(),
        nn.ConvTranspose3d(32,16,kernel_size=4, stride=2, padding=1), #(B,32,4,4,4) -> (B,16,8,8,8)
        nn.ReLU(),
        nn.ConvTranspose3d(16,8,kernel_size=4, stride=2, padding=1), #(B,16,8,8,8) -> (B,8,16,16,16)
        nn.ReLU(),
        nn.ConvTranspose3d(8,1,kernel_size=4, stride=2, padding=1), #(B,8,16,16,16) -> (B,1,32,32,32)
    )            
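
As a quick check of that arithmetic, pushing a dummy feature batch through the decoder should produce the expected 32³ grid (illustrative snippet, shapes only):

feat = torch.randn(4, 512)        # dummy encoded image features
out = self.decoder(feat)
print(out.shape)                  # expected: torch.Size([4, 1, 32, 32, 32])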

# Forward
# call decoder
if args.type == "vox":
    voxels_pred = self.decoder(encoded_feat)  # (B, 1, 32, 32, 32)
    return voxels_pred

2.2 Image to Point Cloud

self.decoder = nn.Sequential(
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Linear(1024, 2048),
    nn.ReLU(),
    nn.Linear(2048, args.n_points * 3),
) 
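
The final linear layer emits a flat vector of length n_points * 3, so the forward pass has to reshape it into a point cloud. A minimal sketch of that step, mirroring the voxel forward path above (the "point" branch name is assumed from the starter code):

# Forward (point cloud branch, assumed branch name)
if args.type == "point":
    pointclouds_pred = self.decoder(encoded_feat).reshape(-1, args.n_points, 3)  # (B, n_points, 3)
    return pointclouds_pred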

2.3 Image to Mesh

num_verts = mesh_pred.verts_packed().shape[0]
self.decoder = nn.Sequential(
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Linear(1024, 2048),
    nn.ReLU(),
    nn.Linear(2048, num_verts * 3),
)            
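
Here the decoder predicts per-vertex offsets rather than absolute positions, so the forward pass deforms the source mesh with PyTorch3D's offset_verts. A sketch under the assumption that self.mesh_pred holds the template (e.g. ico-sphere) meshes from the starter code:

# Forward (mesh branch): predict per-vertex offsets and deform the template mesh
if args.type == "mesh":
    deform_vertices_pred = self.decoder(encoded_feat)                            # (B, num_verts * 3)
    mesh_pred = self.mesh_pred.offset_verts(deform_vertices_pred.reshape(-1, 3))
    return mesh_pred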

2.4 Quantitative Comparisons

2.5 Hyperparameter Variation

2.6 Model Interpretation (via Gradient-Based Saliency Maps)

# minimal implementation snippet
import matplotlib.pyplot as plt

images_gt.requires_grad = True
prediction = model(images_gt, args)
loss = calculate_loss(prediction, ground_truth)
loss.backward()
gradients = images_gt.grad.abs().mean(dim=-1)  # average gradient magnitude across RGB channels
plt.imshow(gradients[0].cpu())

# referenced Simonyan, K., Vedaldi, A., & Zisserman, A. (2013). "Deep Inside Convolutional Networks: Visualising Image Classification Models and Saliency Maps."

3. Exploring Other Architectures / Datasets

3.1 Implicit Network

# decoder design 
elif args.type == "implicit":
    # Implicit decoder: takes image feature (512) + 3D coordinate (3) -> occupancy (1)
    self.decoder = nn.Sequential(
        nn.Linear(515, 256),   # 515 = 512-dim image feature + 3-dim xyz coordinate
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 1)       # Output: single occupancy logit per point
    )
# data loading during training: sample random query points in the normalized space
# Sample N random coordinates per image
B = images.shape[0]
N = 1000  # Number of points to sample
coords = torch.rand(B, N, 3) * 2 - 1  # Uniform in [-1, 1]^3
voxels = feed_dict["voxels"].float()
# Return the voxel grid together with the sampled query coordinates
ground_truth_3d = (voxels, coords)
# Special forward pass handling for implicit
voxels_gt, coords = ground_truth_3d

# Manual forward pass
B = images_gt.shape[0]
N = coords.shape[1]

# Encode images
images_normalize = model.normalize(images_gt.permute(0,3,1,2))
encoded_feat = model.encoder(images_normalize).squeeze(-1).squeeze(-1)

# Expand features and concatenate with coords
features_expanded = encoded_feat.unsqueeze(1).expand(-1, N, -1)
decoder_input = torch.cat([features_expanded, coords], dim=-1)
decoder_input_flat = decoder_input.reshape(B * N, 515)

# Decoder forward
prediction_3d = model.decoder(decoder_input_flat).reshape(B, N, 1)
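
To supervise these predictions, each sampled coordinate needs a target occupancy. One way to get it is trilinear interpolation of the ground-truth voxel grid with F.grid_sample; a sketch assuming the grid lives in the same normalized [-1, 1] frame as coords (grid_sample orders grid coordinates as (x, y, z) = (W, H, D), so the last axis may need flipping depending on how the voxels are stored):

import torch.nn.functional as F

# Look up ground-truth occupancy at the sampled coordinates (trilinear interpolation)
grid = coords.view(B, N, 1, 1, 3)                 # grid_sample expects (B, D_out, H_out, W_out, 3)
occupancy_gt = F.grid_sample(
    voxels_gt.unsqueeze(1),                       # (B, 1, 32, 32, 32)
    grid, mode="bilinear", align_corners=True,
).view(B, N, 1)                                   # (B, N, 1) occupancy in [0, 1]

# BCE between predicted occupancy logits and interpolated targets
loss = nn.BCEWithLogitsLoss()(prediction_3d, occupancy_gt)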

Training logs:

Best loss achieved: 0.12358080
Final epoch loss: 0.12358080
Total epochs: 5
Total iterations: 3810

Final loss: 0.0920
Minimum loss: 0.0634
Average loss: 0.1200
Total iterations: 17236
Total time: 29192.9 seconds
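
At evaluation time, the implicit decoder can be converted back into an explicit 32³ occupancy grid by querying it on a dense coordinate grid, reusing the forward-pass pattern above (the resolution, the 0.5 threshold, and the use of the first batch element are illustrative assumptions):

# Query the implicit decoder densely to recover an explicit voxel grid
res = 32
xs = torch.linspace(-1, 1, res)
grid_coords = torch.stack(torch.meshgrid(xs, xs, xs, indexing="ij"), dim=-1)
grid_coords = grid_coords.reshape(1, -1, 3).to(encoded_feat.device)          # (1, res^3, 3)

feat = encoded_feat[:1].unsqueeze(1).expand(-1, grid_coords.shape[1], -1)    # (1, res^3, 512)
logits = model.decoder(torch.cat([feat, grid_coords], dim=-1).reshape(-1, 515))
voxels_pred = torch.sigmoid(logits).reshape(res, res, res) > 0.5             # binary occupancy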