import numpy as np
import json

# NOTE(review): `nn` (torch.nn) and `n_tokens` (vocabulary size) are used below
# but are not defined in this snippet; presumably they come from earlier
# notebook cells -- confirm before running standalone.

img_codes = np.load("data/image_codes.npy")

# Use a context manager so the file handle is closed deterministically.
with open('data/captions_tokenized.json') as captions_file:
    captions = json.load(captions_file)

# Wrap every caption in explicit boundary tokens (modifies `captions` in place).
for image_captions in captions:
    for caption_i, sentence in enumerate(image_captions):
        image_captions[caption_i] = ["#START#"] + sentence.split(' ') + ["#END#"]


class CaptionNet(nn.Module):
    """A recurrent 'head' network for image captioning.

    Image feature vectors are projected into the initial LSTM state, and the
    LSTM then predicts the next caption token at every position.
    """

    def __init__(self, n_tokens=n_tokens, emb_size=128, lstm_units=256, cnn_feature_size=2048):
        """
        Build the layers of the captioning head.

        :param n_tokens: vocabulary size (number of output classes per tick)
        :param emb_size: dimensionality of the token embeddings
        :param lstm_units: size of the LSTM hidden and cell states
        :param cnn_feature_size: size of the incoming image feature vector
        """
        super().__init__()

        # Layers that convert conv features to initial_h (h_0) and initial_c (c_0).
        self.cnn_to_h0 = nn.Linear(cnn_feature_size, lstm_units)
        self.cnn_to_c0 = nn.Linear(cnn_feature_size, lstm_units)

        # Embedding for input words.
        self.emb = nn.Embedding(n_tokens, emb_size)

        # Recurrent core; batch_first=True matches the [batch, word_i] caption layout.
        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)

        # Maps each LSTM hidden state to one logit per token.
        self.logits = nn.Linear(lstm_units, n_tokens)

    def forward(self, image_vectors, captions_ix):
        """
        Apply the network in training mode.

        :param image_vectors: torch tensor containing inception vectors. shape: [batch, cnn_feature_size]
        :param captions_ix: torch tensor containing captions as matrix. shape: [batch, word_i].
            padded with pad_ix
        :returns: logits for next token at each tick, shape: [batch, word_i, n_tokens]
        """
        self.lstm.flatten_parameters()

        initial_cell = self.cnn_to_c0(image_vectors)
        initial_hid = self.cnn_to_h0(image_vectors)

        # Embeddings for captions_ix: [batch, word_i, emb_size].
        captions_emb = self.emb(captions_ix)

        # nn.LSTM takes its state as (h_0, c_0), each shaped
        # [num_layers, batch, lstm_units]; [None] adds the leading layer axis.
        # lstm_out: [batch, caption_length, lstm_units].
        lstm_out, _ = self.lstm(captions_emb, (initial_hid[None], initial_cell[None]))

        # One logit per token at every tick.
        return self.logits(lstm_out)


# For reference and more detail go to ---> https://colab.research.google.com/github/hse-aml/intro-to-dl-pytorch/blob/main/week06/week06_final_project_image_captioning.ipynb#scrollTo=x_wH1bYu-QK8

import numpy as np
import json

# NOTE(review): `nn` (torch.nn) and `n_tokens` (vocabulary size) are used below
# but are not defined in this snippet; presumably they come from earlier
# notebook cells -- confirm before running standalone.

img_codes = np.load("data/image_codes.npy")

# Use a context manager so the file handle is closed deterministically.
with open('data/captions_tokenized.json') as captions_file:
    captions = json.load(captions_file)

# Wrap every caption in explicit boundary tokens (modifies `captions` in place).
for image_captions in captions:
    for caption_i, sentence in enumerate(image_captions):
        image_captions[caption_i] = ["#START#"] + sentence.split(' ') + ["#END#"]


class CaptionNet(nn.Module):
    """A recurrent 'head' network for image captioning.

    Image feature vectors are projected into the initial LSTM state, and the
    LSTM then predicts the next caption token at every position.
    """

    # The constructor must be spelled __init__ (with the double underscores)
    # for Python to call it when the module is instantiated.
    def __init__(self, n_tokens=n_tokens, emb_size=128, lstm_units=256, cnn_feature_size=2048):
        """
        Build the layers of the captioning head.

        :param n_tokens: vocabulary size (number of output classes per tick)
        :param emb_size: dimensionality of the token embeddings
        :param lstm_units: size of the LSTM hidden and cell states
        :param cnn_feature_size: size of the incoming image feature vector
        """
        super().__init__()

        # Layers that convert conv features to initial_h (h_0) and initial_c (c_0).
        self.cnn_to_h0 = nn.Linear(cnn_feature_size, lstm_units)
        self.cnn_to_c0 = nn.Linear(cnn_feature_size, lstm_units)

        # Embedding for input words.
        self.emb = nn.Embedding(n_tokens, emb_size)

        # Recurrent core; batch_first=True matches the [batch, word_i] caption layout.
        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)

        # Maps each LSTM hidden state to one logit per token.
        self.logits = nn.Linear(lstm_units, n_tokens)

    def forward(self, image_vectors, captions_ix):
        """
        Apply the network in training mode.

        :param image_vectors: torch tensor containing inception vectors. shape: [batch, cnn_feature_size]
        :param captions_ix: torch tensor containing captions as matrix. shape: [batch, word_i].
            padded with pad_ix
        :returns: logits for next token at each tick, shape: [batch, word_i, n_tokens]
        """
        self.lstm.flatten_parameters()

        initial_cell = self.cnn_to_c0(image_vectors)
        initial_hid = self.cnn_to_h0(image_vectors)

        # Embeddings for captions_ix: [batch, word_i, emb_size].
        captions_emb = self.emb(captions_ix)

        # nn.LSTM takes its state as (h_0, c_0), each shaped
        # [num_layers, batch, lstm_units]; [None] adds the leading layer axis.
        # lstm_out: [batch, caption_length, lstm_units].
        lstm_out, _ = self.lstm(captions_emb, (initial_hid[None], initial_cell[None]))

        # One logit per token at every tick.
        return self.logits(lstm_out)


# For reference and more detail go to ---> https://colab.research.google.com/github/hse-aml/intro-to-dl-pytorch/blob/main/week06/week06_final_project_image_captioning.ipynb#scrollTo=x_wH1bYu-QK8

Computer Networking: A Top-Down Approach (7th Edition)

7th Edition

ISBN: 9780133594140

Author: James Kurose, Keith Ross

Publisher: Pearson

Chapter 1: Computer Networks And The Internet

Section: Chapter Questions

Problem R1RQ: What is the difference between a host and an end system? List several different types of end...

See similar textbooks

Related questions

Question

import numpy as np

import json

img_codes = np.load("data/image_codes.npy")

# Use a context manager so the file handle is closed deterministically.
with open('data/captions_tokenized.json') as captions_file:
    captions = json.load(captions_file)

# Wrap every caption in explicit boundary tokens (modifies `captions` in place).
# The indexing below implies `captions` is a list (one entry per image) of
# lists of space-separated caption strings.
for image_captions in captions:
    for caption_i, sentence in enumerate(image_captions):
        image_captions[caption_i] = ["#START#"] + sentence.split(' ') + ["#END#"]

class CaptionNet(nn.Module):
    """A recurrent 'head' network for image captioning.

    Image feature vectors are projected into the initial LSTM state, and the
    LSTM then predicts the next caption token at every position.

    NOTE(review): `nn` (torch.nn) and the default `n_tokens` are expected to be
    defined earlier in the file/notebook; they are not visible in this snippet.
    """

    def __init__(self, n_tokens=n_tokens, emb_size=128, lstm_units=256, cnn_feature_size=2048):
        """
        Build the layers of the captioning head.

        :param n_tokens: vocabulary size (number of output classes per tick)
        :param emb_size: dimensionality of the token embeddings
        :param lstm_units: size of the LSTM hidden and cell states
        :param cnn_feature_size: size of the incoming image feature vector
        """
        super().__init__()

        # Layers that convert conv features to initial_h (h_0) and initial_c (c_0).
        self.cnn_to_h0 = nn.Linear(cnn_feature_size, lstm_units)
        self.cnn_to_c0 = nn.Linear(cnn_feature_size, lstm_units)

        # Embedding for input words.
        self.emb = nn.Embedding(n_tokens, emb_size)

        # Recurrent core; batch_first=True matches the [batch, word_i] caption layout.
        self.lstm = nn.LSTM(emb_size, lstm_units, batch_first=True)

        # Maps each LSTM hidden state to one logit per token.
        self.logits = nn.Linear(lstm_units, n_tokens)

    def forward(self, image_vectors, captions_ix):
        """
        Apply the network in training mode.

        :param image_vectors: torch tensor containing inception vectors. shape: [batch, cnn_feature_size]
        :param captions_ix: torch tensor containing captions as matrix. shape: [batch, word_i].
            padded with pad_ix
        :returns: logits for next token at each tick, shape: [batch, word_i, n_tokens]
        """
        self.lstm.flatten_parameters()

        initial_cell = self.cnn_to_c0(image_vectors)
        initial_hid = self.cnn_to_h0(image_vectors)

        # Embeddings for captions_ix: [batch, word_i, emb_size].
        captions_emb = self.emb(captions_ix)

        # nn.LSTM takes its state as (h_0, c_0), each shaped
        # [num_layers, batch, lstm_units]; [None] adds the leading layer axis.
        # lstm_out: [batch, caption_length, lstm_units].
        lstm_out, _ = self.lstm(captions_emb, (initial_hid[None], initial_cell[None]))

        # One logit per token at every tick.
        return self.logits(lstm_out)


# For reference and more detail go to ---> https://colab.research.google.com/github/hse-aml/intro-to-dl-pytorch/blob/main/week06/week06_final_project_image_captioning.ipynb#scrollTo=x_wH1bYu-QK8

Expert Solution

Trending now

This is a popular solution!

Step by step

Solved in 3 steps with 1 image

SEE SOLUTION Check out a sample Q&A here