A concise but complete implementation of CLIP, with various experimental improvements from recent papers.
$ pip install x-clip
import torch
from x_clip import CLIP
# Sizes shared by the model configuration and the mock batch below, named once
# so the two cannot drift apart.
BATCH_SIZE = 4
NUM_TEXT_TOKENS = 10000
TEXT_SEQ_LEN = 256
IMAGE_SIZE = 256

# Build a CLIP model with the optional FILIP / DCL / CLOOB variations enabled.
clip = CLIP(
    dim_text = 512,
    dim_image = 512,
    dim_latent = 512,
    num_text_tokens = NUM_TEXT_TOKENS,
    text_enc_depth = 6,
    text_seq_len = TEXT_SEQ_LEN,
    text_heads = 8,
    num_visual_tokens = 512,
    visual_enc_depth = 6,
    visual_image_size = IMAGE_SIZE,
    visual_patch_size = 32,
    visual_heads = 8,
    # fine-grained contrastive learning (FILIP)
    use_all_token_embeds = True,
    # decoupled contrastive learning (DCL) objective: positive pairs are
    # removed from the denominator of the InfoNCE loss (CLOOB + DCL)
    decoupled_contrastive_learning = True,
    # separate projections for text-to-image vs image-to-text comparisons (CLOOB)
    extra_latent_projection = True,
)

# Mock data: random token ids, random images, and a mask that keeps every token.
text = torch.randint(0, NUM_TEXT_TOKENS, (BATCH_SIZE, TEXT_SEQ_LEN))
images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE)
mask = torch.ones_like(text, dtype = torch.bool)

# Train: one forward pass that returns the contrastive loss, then backprop.
loss = clip(
    text,
    images,
    text_mask = mask,              # mask for text
    freeze_image_encoder = False,  # set True to freeze a pretrained image encoder, as proposed in the LiT paper
    return_loss = True,            # must be True for the contrastive loss to be returned
)
loss.backward()
@misc{radford2021learning,
title = {Learning Transferable Visual Models From Natural Language Supervision},
author = {Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
year = {2021},
eprint = {2103.00020},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}
@misc{yao2021filip,
title = {FILIP: Fine-grained Interactive Language-Image Pre-Training},
author = {Lewei Yao and Runhui Huang and Lu Hou and Guansong Lu and Minzhe Niu and Hang Xu and Xiaodan Liang and Zhenguo Li and Xin Jiang and Chunjing Xu},
year = {2021},
eprint = {2111.07783},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}
@misc{furst2021cloob,
title = {CLOOB: Modern Hopfield Networks with InfoLOOB Outperform CLIP},
author = {Andreas Fürst and Elisabeth Rumetshofer and Viet Tran and Hubert Ramsauer and Fei Tang and Johannes Lehner and David Kreil and Michael Kopp and Günter Klambauer and Angela Bitto-Nemling and Sepp Hochreiter},
year = {2021},
eprint = {2110.11316},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}
@misc{yeh2021decoupled,
title = {Decoupled Contrastive Learning},
author = {Chun-Hsiao Yeh and Cheng-Yao Hong and Yen-Chi Hsu and Tyng-Luh Liu and Yubei Chen and Yann LeCun},
year = {2021},
eprint = {2110.06848},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}
@misc{zhai2021lit,
title = {LiT: Zero-Shot Transfer with Locked-image Text Tuning},
author = {Xiaohua Zhai and Xiao Wang and Basil Mustafa and Andreas Steiner and Daniel Keysers and Alexander Kolesnikov and Lucas Beyer},
year = {2021},
eprint = {2111.07991},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}