This repository was archived by the owner on May 1, 2025. It is now read-only.
Test #149 (Open)

Changes from all commits (45 commits):
b597a75  Remove some errors  (YifeiLin0226, Apr 18, 2025)
5a7d824  add notebook & directly used finetuned result  (YifeiLin0226, Apr 18, 2025)
feee3ce  rename evaluation result of the given finetuned checkpoint  (YifeiLin0226, Apr 19, 2025)
a0d1f2b  Upload our own finetuned result + config + notebook  (YifeiLin0226, Apr 19, 2025)
9f32a82  Remove distributed mode in evaluation  (YifeiLin0226, Apr 19, 2025)
83564f0  Created using Colab  (VickieRanran, Apr 19, 2025)
411516d  Created using Colab  (VickieRanran, Apr 19, 2025)
e083025  Created using Colab  (VickieRanran, Apr 19, 2025)
313836e  Created using Colab  (VickieRanran, Apr 19, 2025)
3880611  Created using Colab  (VickieRanran, Apr 19, 2025)
0890a4c  Created using Colab  (VickieRanran, Apr 19, 2025)
ee61ec7  Created using Colab  (VickieRanran, Apr 19, 2025)
ff1063a  Created using Colab  (VickieRanran, Apr 19, 2025)
3aec524  Created using Colab  (VickieRanran, Apr 19, 2025)
9e66915  Created using Colab  (VickieRanran, Apr 19, 2025)
860d5d0  Created using Colab  (VickieRanran, Apr 19, 2025)
320a75f  Created using Colab  (VickieRanran, Apr 19, 2025)
381ab66  Created using Colab  (VickieRanran, Apr 19, 2025)
95ebb92  Created using Colab  (VickieRanran, Apr 19, 2025)
a087acb  Created using Colab  (VickieRanran, Apr 19, 2025)
52749d4  Created using Colab  (VickieRanran, Apr 19, 2025)
c95a636  Created using Colab  (VickieRanran, Apr 19, 2025)
01d3bb8  Created using Colab  (VickieRanran, Apr 19, 2025)
77f2858  Created using Colab  (VickieRanran, Apr 19, 2025)
2563f05  Created using Colab  (VickieRanran, Apr 19, 2025)
058eb3c  Created using Colab  (VickieRanran, Apr 19, 2025)
607e9d1  Created using Colab  (VickieRanran, Apr 19, 2025)
5ead711  Created using Colab  (VickieRanran, Apr 19, 2025)
f2632a5  Created using Colab  (VickieRanran, Apr 19, 2025)
b092d7b  Created using Colab  (VickieRanran, Apr 19, 2025)
14454d7  Created using Colab  (VickieRanran, Apr 19, 2025)
3827588  Created using Colab  (VickieRanran, Apr 19, 2025)
8321ca3  Created using Colab  (VickieRanran, Apr 19, 2025)
cc71a16  Created using Colab  (VickieRanran, Apr 19, 2025)
0993d3e  Created using Colab  (VickieRanran, Apr 19, 2025)
153974e  Created using Colab  (VickieRanran, Apr 19, 2025)
322ca39  clean up the code  (VickieRanran, Apr 20, 2025)
beeca25  Created using Colab  (VickieRanran, Apr 20, 2025)
5cbae41  Created using Colab  (VickieRanran, Apr 20, 2025)
0bd4f0c  Created using Colab  (VickieRanran, Apr 20, 2025)
b35ddb2  Created using Colab  (VickieRanran, Apr 23, 2025)
d83d818  Created using Colab  (VickieRanran, Apr 24, 2025)
4104edc  Created using Colab  (VickieRanran, Apr 24, 2025)
a70a1dc  Created using Colab  (VickieRanran, Apr 24, 2025)
54edfc6  Created using Colab  (VickieRanran, Apr 24, 2025)
2,715 changes: 2,715 additions & 0 deletions ALBEF.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ALBEF_finetune.ipynb

Large diffs are not rendered by default.

1,501 changes: 1,501 additions & 0 deletions Pretrain.ipynb

Large diffs are not rendered by default.

676 changes: 676 additions & 0 deletions VQA.yaml

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions configs/VQA.yaml
@@ -1,12 +1,13 @@
 train_file: ['data/vqa_train.json',
-             'data/vqa_val.json',
-             'data/vg_qa.json']
+             'data/vqa_val.json',
+             # 'data/vg_qa.json'
+             ]

 test_file: ['data/vqa_test.json']
 answer_list: 'data/answer_list.json'

-vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #train2014/
-vg_root: '/export/share/datasets/vision/visual-genome/' #image/
+vqa_root: 'data/' #train2014/
+vg_root: 'data/' #image/

 image_res: 384
 batch_size_train: 32
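
Net effect of this hunk: the Visual Genome QA data is dropped from training and both image roots now point at a local data/ directory. A minimal sketch of how the edited file parses, assuming a PyYAML-compatible loader (the repo's own loading code may differ):

import yaml  # assumption: any YAML 1.1 loader behaves the same here

with open('configs/VQA.yaml') as f:
    config = yaml.safe_load(f)

# The commented-out 'data/vg_qa.json' entry vanishes from the flow sequence,
# so only the two VQA files remain for training.
print(config['train_file'])  # ['data/vqa_train.json', 'data/vqa_val.json']
print(config['vqa_root'])    # 'data/'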
2 changes: 1 addition & 1 deletion configs/config_bert.json
@@ -12,7 +12,7 @@
   "max_position_embeddings": 512,
   "model_type": "bert",
   "num_attention_heads": 12,
-  "num_hidden_layers": 12,
+  "num_hidden_layers": 3,
   "pad_token_id": 0,
   "type_vocab_size": 2,
   "vocab_size": 30522,
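
Cutting num_hidden_layers from 12 to 3 shrinks the text encoder substantially. A quick sanity check, sketched against the Hugging Face BertConfig/BertModel API (the repo's vendored xbert classes should mirror it):

from transformers import BertConfig, BertModel

cfg = BertConfig.from_json_file('configs/config_bert.json')
print(cfg.num_hidden_layers)     # 3
model = BertModel(cfg)
print(len(model.encoder.layer))  # 3 transformer blocks instead of 12

Note that from_pretrained('bert-base-uncased') with this config will leave the weights of layers 3-11 unused (transformers logs a warning about them).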
4 changes: 2 additions & 2 deletions dataset/utils.py
@@ -116,7 +116,7 @@ def save_result(result, result_dir, filename, is_json=True, is_list=True):
         final_result_file = os.path.join(result_dir, '%s.pth'%filename)
         torch.save(result,result_file)

-    dist.barrier()
+    # dist.barrier()

     if utils.is_main_process():
         # combine results from all processes
@@ -141,7 +141,7 @@ def save_result(result, result_dir, filename, is_json=True, is_list=True):
         torch.save(result,final_result_file)

         print('result file saved to %s'%final_result_file)
-    dist.barrier()
+    # dist.barrier()
     return final_result_file
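
Commenting out dist.barrier() is what lets save_result run in the single-process Colab setup: torch.distributed raises a RuntimeError when barrier() is called without an initialized process group. A gentler variant, sketched below, keeps one code path working in both modes instead of deleting the synchronization:

import torch.distributed as dist

def barrier_if_distributed():
    # No-op in single-process runs; real barrier when distributed is up.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()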
6 changes: 3 additions & 3 deletions models/model_vqa.py
@@ -21,20 +21,20 @@ def __init__(self,
         self.distill = config['distill']

         self.visual_encoder = VisionTransformer(
-            img_size=config['image_res'], patch_size=16, embed_dim=768, depth=12, num_heads=12,
+            img_size=config['image_res'], patch_size=16, embed_dim=768, depth=6, num_heads=12,
             mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))

         config_encoder = BertConfig.from_json_file(config['bert_config'])
         self.text_encoder = BertModel.from_pretrained(text_encoder, config=config_encoder, add_pooling_layer=False)

         config_decoder = BertConfig.from_json_file(config['bert_config'])
         config_decoder.fusion_layer = 0
-        config_decoder.num_hidden_layers = 6
+        config_decoder.num_hidden_layers = 3
         self.text_decoder = BertLMHeadModel.from_pretrained(text_decoder, config=config_decoder)

         if self.distill:
             self.visual_encoder_m = VisionTransformer(
-                img_size=config['image_res'], patch_size=16, embed_dim=768, depth=12, num_heads=12,
+                img_size=config['image_res'], patch_size=16, embed_dim=768, depth=6, num_heads=12,
                 mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
             self.text_encoder_m = BertModel.from_pretrained(text_encoder, config=config_encoder, add_pooling_layer=False)
             self.text_decoder_m = BertLMHeadModel.from_pretrained(text_decoder, config=config_decoder)
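
Halving the ViT depth (12 to 6 blocks) and the answer decoder (6 to 3 layers) roughly halves those components. Each ViT block at embed_dim=768 and mlp_ratio=4 holds about 7.1M parameters, so six fewer blocks is roughly 42M fewer parameters. A sketch to verify, with the import path assumed from the ALBEF repo layout:

from functools import partial
import torch.nn as nn
from models.vit import VisionTransformer  # assumed module path

def vit_params(depth):
    vit = VisionTransformer(img_size=384, patch_size=16, embed_dim=768,
                            depth=depth, num_heads=12, mlp_ratio=4,
                            qkv_bias=True,
                            norm_layer=partial(nn.LayerNorm, eps=1e-6))
    return sum(p.numel() for p in vit.parameters())

print((vit_params(12) - vit_params(6)) / 1e6)  # ~42.5M fewer parameters

One caveat: pretrained ViT checkpoints were trained at depth 12, so loading one into a depth-6 encoder leaves the dropped blocks' weights unused.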
12 changes: 6 additions & 6 deletions models/xbert.py
@@ -869,7 +869,7 @@ class PreTrainedModel

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=BaseModelOutputWithPoolingAndCrossAttentions,
         config_class=_CONFIG_FOR_DOC,
@@ -1362,7 +1362,7 @@ def set_output_embeddings(self, new_embeddings):

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=MaskedLMOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1567,7 +1567,7 @@ def __init__(self, config):

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=SequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1651,7 +1651,7 @@ def __init__(self, config):

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=MultipleChoiceModelOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1746,7 +1746,7 @@ def __init__(self, config):

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=TokenClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
@@ -1836,7 +1836,7 @@ def __init__(self, config):

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
-        tokenizer_class=_TOKENIZER_FOR_DOC,
+        processor_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
         output_type=QuestionAnsweringModelOutput,
         config_class=_CONFIG_FOR_DOC,
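
These six identical edits appear to track an upstream rename: newer transformers releases changed the add_code_sample_docstrings keyword from tokenizer_class to processor_class, so the vendored xbert.py stopped working under a recent install. If the file ever needs to support both versions, a small shim along these lines (a sketch, not part of this PR) would avoid hard-coding either name:

import inspect

def call_with_compat_kwarg(fn, **kwargs):
    # Rename tokenizer_class -> processor_class when the installed
    # transformers version expects the newer keyword.
    params = inspect.signature(fn).parameters
    if 'tokenizer_class' in kwargs and 'tokenizer_class' not in params:
        kwargs['processor_class'] = kwargs.pop('tokenizer_class')
    return fn(**kwargs)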
17 changes: 17 additions & 0 deletions output_eval/vqa/config.yaml
@@ -0,0 +1,17 @@
+alpha: 0.4
+answer_list: data/answer_list.json
+batch_size_test: 32
+batch_size_train: 32
+bert_config: configs/config_bert.json
+distill: true
+eos: '[SEP]'
+image_res: 384
+k_test: 128
+optimizer: {lr: 2e-05, opt: adamW, weight_decay: 0.02}
+schedular: {cooldown_epochs: 0, decay_rate: 1, epochs: 8, lr: 2e-05, min_lr: 1e-06,
+  sched: cosine, warmup_epochs: 4, warmup_lr: 1e-05}
+test_file: [data/vqa_test.json]
+train_file: [data/vqa_train.json]
+vg_root: data/
+vqa_root: data/
+warm_up: true
1 change: 1 addition & 0 deletions output_eval/vqa/result/vqa_result_epoch0.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions output_eval/vqa/result/vqa_result_epoch0_rank0.json

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions output_finetune/config.yaml
@@ -0,0 +1,17 @@
+alpha: 0.4
+answer_list: data/answer_list.json
+batch_size_test: 16
+batch_size_train: 32
+bert_config: configs/config_bert.json
+distill: true
+eos: '[SEP]'
+image_res: 384
+k_test: 128
+optimizer: {lr: 2e-05, opt: adamW, weight_decay: 0.02}
+schedular: {cooldown_epochs: 0, decay_rate: 1, epochs: 8, lr: 2e-05, min_lr: 1e-06,
+  sched: cosine, warmup_epochs: 4, warmup_lr: 1e-05}
+test_file: [data/vqa_test.json]
+train_file: [data/vqa_train.json, data/vqa_val.json]
+vg_root: data/
+vqa_root: data/
+warm_up: true
8 changes: 8 additions & 0 deletions output_finetune/log.txt
@@ -0,0 +1,8 @@
+{"train_lr": "0.000", "train_loss": "3.903", "epoch": 0}
+{"train_lr": "0.000", "train_loss": "3.409", "epoch": 1}
+{"train_lr": "0.000", "train_loss": "3.258", "epoch": 2}
+{"train_lr": "0.000", "train_loss": "3.159", "epoch": 3}
+{"train_lr": "0.000", "train_loss": "3.085", "epoch": 4}
+{"train_lr": "0.000", "train_loss": "3.027", "epoch": 5}
+{"train_lr": "0.000", "train_loss": "2.987", "epoch": 6}
+{"train_lr": "0.000", "train_loss": "2.961", "epoch": 7}
1 change: 1 addition & 0 deletions output_finetune/result/vqa_result_epoch0.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions output_finetune/result/vqa_result_epoch0_rank0.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions output_finetune/result/vqa_result_epoch7.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions output_finetune/result/vqa_result_epoch7_rank0.json

Large diffs are not rendered by default.