diff --git a/comfy/clip_config_bigg.json b/comfy/clip_config_bigg.json
index 32d82ff3..35261dee 100644
--- a/comfy/clip_config_bigg.json
+++ b/comfy/clip_config_bigg.json
@@ -5,7 +5,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
   "hidden_act": "gelu",
   "hidden_size": 1280,
   "initializer_factor": 1.0,
diff --git a/comfy/clip_model.py b/comfy/clip_model.py
index 14f43c56..ab775309 100644
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -87,6 +87,7 @@ class CLIPTextModel_(torch.nn.Module):
         heads = config_dict["num_attention_heads"]
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
+        self.eos_token_id = config_dict["eos_token_id"]
 
         super().__init__()
         self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
@@ -111,7 +112,7 @@ class CLIPTextModel_(torch.nn.Module):
         if i is not None and final_layer_norm_intermediate:
             i = self.final_layer_norm(i)
 
-        pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
         return x, i, pooled_output
 
 class CLIPTextModel(torch.nn.Module):
diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index c7bc1e4d..f209bed4 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -140,15 +140,13 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
 
     def set_up_textual_embeddings(self, tokens, current_embeds):
         out_tokens = []
-        next_new_token = token_dict_size = current_embeds.weight.shape[0] - 1
+        next_new_token = token_dict_size = current_embeds.weight.shape[0]
         embedding_weights = []
 
         for x in tokens:
             tokens_temp = []
             for y in x:
                 if isinstance(y, numbers.Integral):
-                    if y == token_dict_size: #EOS token
-                        y = -1
                     tokens_temp += [int(y)]
                 else:
                     if y.shape[0] == current_embeds.weight.shape[1]:
@@ -164,11 +162,10 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         n = token_dict_size
         if len(embedding_weights) > 0:
             new_embedding = torch.nn.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
-            new_embedding.weight[:token_dict_size] = current_embeds.weight[:-1]
+            new_embedding.weight[:token_dict_size] = current_embeds.weight
             for x in embedding_weights:
                 new_embedding.weight[n] = x
                 n += 1
-            new_embedding.weight[n] = current_embeds.weight[-1] #EOS embedding
             self.transformer.set_input_embeddings(new_embedding)
 
         processed_tokens = []
diff --git a/comfy/sd1_clip_config.json b/comfy/sd1_clip_config.json
index 0158a1fd..3ba8c6b5 100644
--- a/comfy/sd1_clip_config.json
+++ b/comfy/sd1_clip_config.json
@@ -6,7 +6,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
   "hidden_act": "quick_gelu",
   "hidden_size": 768,
   "initializer_factor": 1.0,
diff --git a/comfy/text_encoders/sd2_clip_config.json b/comfy/text_encoders/sd2_clip_config.json
index 85cec832..00893cfd 100644
--- a/comfy/text_encoders/sd2_clip_config.json
+++ b/comfy/text_encoders/sd2_clip_config.json
@@ -5,7 +5,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
   "hidden_act": "gelu",
   "hidden_size": 1024,
   "initializer_factor": 1.0,
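
For context on the pooled_output change in comfy/clip_model.py, here is a minimal sketch (not part of the patch, using made-up token sequences) of why matching the configured eos_token_id is more robust than taking the argmax of the raw token ids: the old argmax only lands on the EOS position while 49407 happens to be the largest id in the prompt, which stops holding once extra textual-inversion embeddings with ids of 49408 and above are appended to the vocabulary, as the sd1_clip.py hunks now allow.

import torch

eos_token_id = 49407  # CLIP end-of-text token, as set in the updated configs

# Hypothetical prompts: BOS, a few word tokens, EOS, then padding.
plain = torch.tensor([
    [49406, 320, 1125, 49407, 49407, 49407],   # padded with EOS
    [49406, 320, 1125, 2368, 49407, 0],        # padded with 0
])
# Same idea, but with a custom embedding whose id (49408) exceeds the EOS id.
with_embedding = torch.tensor([[49406, 49408, 1125, 49407, 49407, 49407]])

def old_index(tokens):
    # Previous behaviour: argmax over raw ids; only finds EOS if it is the largest id.
    return tokens.to(dtype=torch.int).argmax(dim=-1)

def new_index(tokens):
    # New behaviour: first position equal to the configured EOS id.
    return (torch.round(tokens.float()).to(dtype=torch.int) == eos_token_id).int().argmax(dim=-1)

print(old_index(plain), new_index(plain))                    # tensor([3, 4]) tensor([3, 4])
print(old_index(with_embedding), new_index(with_embedding))  # tensor([1]) tensor([3])

This is presumably also why set_up_textual_embeddings no longer remaps the EOS token to the end of the resized embedding table: with an explicit comparison against eos_token_id, the pooled output no longer depends on EOS having the highest token id.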