diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 2c8603bb..a887e51b 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -50,18 +50,22 @@ def convert_to_transformers(sd, prefix): if "{}proj".format(prefix) in sd_k: sd['visual_projection.weight'] = sd.pop("{}proj".format(prefix)).transpose(0, 1) - sd = transformers_convert(sd, prefix, "vision_model.", 32) + sd = transformers_convert(sd, prefix, "vision_model.", 48) return sd def load_clipvision_from_sd(sd, prefix="", convert_keys=False): if convert_keys: sd = convert_to_transformers(sd, prefix) - if "vision_model.encoder.layers.30.layer_norm1.weight" in sd: + if "vision_model.encoder.layers.47.layer_norm1.weight" in sd: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json") + elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json") else: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") clip = ClipVisionModel(json_config) m, u = clip.load_sd(sd) + if len(m) > 0: + print("missing clip vision:", m) u = set(u) keys = list(sd.keys()) for k in keys: @@ -72,4 +76,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False): def load(ckpt_path): sd = load_torch_file(ckpt_path) - return load_clipvision_from_sd(sd) + if "visual.transformer.resblocks.0.attn.in_proj_weight" in sd: + return load_clipvision_from_sd(sd, prefix="visual.", convert_keys=True) + else: + return load_clipvision_from_sd(sd) diff --git a/comfy/clip_vision_config_g.json b/comfy/clip_vision_config_g.json new file mode 100644 index 00000000..708e7e21 --- /dev/null +++ b/comfy/clip_vision_config_g.json @@ -0,0 +1,18 @@ +{ + "attention_dropout": 0.0, + "dropout": 0.0, + "hidden_act": "gelu", + "hidden_size": 1664, + "image_size": 224, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 8192, + "layer_norm_eps": 1e-05, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 48, + "patch_size": 14, + "projection_dim": 1280, + "torch_dtype": "float32" +}