Fix to #3465. Prevent resaving of duplicate images if overwrite not specified (#3472)

* Fix to #3465. Prevent the resaving of duplicate images if overwrite not specified

This is a fix to #3465 

Adds function compare_image_hash to do a sha256 hash comparison between an uploaded image and existing images with matching file names. 

This changes the behavior so that an uploaded image whose filename matches an existing file is saved to input only if its content is actually different. An exact duplicate is no longer resaved under an incremented filename; the existing file is opened instead.

Currently, exact duplicates with the same filename are resaved with an incremented filename in the format:

<filename> (n).ext 

with the code: 

```
while os.path.exists(filepath): 
                        filename = f"{split[0]} ({i}){split[1]}"
                        filepath = os.path.join(full_output_folder, filename)
                        i += 1
```

This commit changes this to: 

```
while os.path.exists(filepath): 
                        if compare_image_hash(filepath, image):
                            image_is_duplicate = True
                            break
                        filename = f"{split[0]} ({i}){split[1]}"
                        filepath = os.path.join(full_output_folder, filename)
                        i += 1
```

Before the file is saved, a check is made that image_is_duplicate is False; if it is True, the save is skipped.

Currently, if you load the same image of a cat named cat.jpg into the LoadImage node 3 times, you will get 3 new files in your input folder with incremented file names.

With this change, you will now only have the single copy of cat.jpg, that will be re-opened instead of re-saved. 

However if you load 3 different images of cats named cat.jpg, you will get the expected behavior of having:
cat.jpg
cat (1).jpg
cat (2).jpg

This saves space and clutter. After checking my own input folder, I have 800+ images that are duplicates that were resaved with incremented file names amounting to more than 5GB of duplicated data.

* fixed typo in expression
This commit is contained in:
shawnington 2024-07-02 01:30:33 -04:00 committed by GitHub
parent 1ef66b0955
commit 52aaee251f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -12,6 +12,7 @@ import json
import glob import glob
import struct import struct
import ssl import ssl
import hashlib
from PIL import Image, ImageOps from PIL import Image, ImageOps
from PIL.PngImagePlugin import PngInfo from PIL.PngImagePlugin import PngInfo
from io import BytesIO from io import BytesIO
@ -153,10 +154,24 @@ class PromptServer():
type_dir = folder_paths.get_output_directory() type_dir = folder_paths.get_output_directory()
return type_dir, dir_type return type_dir, dir_type
def compare_image_hash(filepath, image):
    """Report whether an uploaded image is byte-identical to a file on disk.

    Computes a SHA-256 digest of the file at ``filepath`` and of the data
    read from ``image.file`` (from its current position) and compares them.
    Fix to #3465: used to avoid re-saving exact duplicates of an image.

    Args:
        filepath: Path of the existing file to compare against.
        image: Upload object exposing a seekable binary stream as ``.file``.

    Returns:
        True if both digests match; False if they differ or if ``filepath``
        does not exist.

    Side effects:
        ``image.file`` is rewound to offset 0 afterwards so the caller can
        still read the upload in order to save it.
    """
    if not os.path.exists(filepath):
        return False

    # Hash in fixed-size chunks so large images are never held fully in memory.
    chunk_size = 1024 * 1024
    existing_digest = hashlib.sha256()
    uploaded_digest = hashlib.sha256()

    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            existing_digest.update(chunk)
        try:
            for chunk in iter(lambda: image.file.read(chunk_size), b""):
                uploaded_digest.update(chunk)
        finally:
            # Always rewind, even if reading fails, so the upload stays usable.
            image.file.seek(0)

    return existing_digest.digest() == uploaded_digest.digest()
def image_upload(post, image_save_function=None): def image_upload(post, image_save_function=None):
image = post.get("image") image = post.get("image")
overwrite = post.get("overwrite") overwrite = post.get("overwrite")
image_is_duplicate = False
image_upload_type = post.get("type") image_upload_type = post.get("type")
upload_dir, image_upload_type = get_dir_by_type(image_upload_type) upload_dir, image_upload_type = get_dir_by_type(image_upload_type)
@ -183,15 +198,19 @@ class PromptServer():
else: else:
i = 1 i = 1
while os.path.exists(filepath): while os.path.exists(filepath):
if compare_image_hash(filepath, image): #compare hash to prevent saving of duplicates with same name, fix for #3465
image_is_duplicate = True
break
filename = f"{split[0]} ({i}){split[1]}" filename = f"{split[0]} ({i}){split[1]}"
filepath = os.path.join(full_output_folder, filename) filepath = os.path.join(full_output_folder, filename)
i += 1 i += 1
if image_save_function is not None: if not image_is_duplicate:
image_save_function(image, post, filepath) if image_save_function is not None:
else: image_save_function(image, post, filepath)
with open(filepath, "wb") as f: else:
f.write(image.file.read()) with open(filepath, "wb") as f:
f.write(image.file.read())
return web.json_response({"name" : filename, "subfolder": subfolder, "type": image_upload_type}) return web.json_response({"name" : filename, "subfolder": subfolder, "type": image_upload_type})
else: else: