Conversation
| EXAMPLE_DOC_STRING = """ | ||
| Examples: | ||
| ```python | ||
| >>> import torch |
There was a problem hiding this comment.
| >>> import torch | |
| >>> import PIL.Image | |
| >>> import torch |
We use PIL.Image below when creating mask_video, so it should be imported here.
| >>> video = load_video( | ||
| ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4" | ||
| ... ) | ||
| >>> mask_video = [Image.new("L", frame.size, 255) for frame in video] |
There was a problem hiding this comment.
| >>> mask_video = [Image.new("L", frame.size, 255) for frame in video] | |
| >>> mask_video = [PIL.Image.new("L", frame.size, 255) for frame in video] |
Follow up change to #13331 (comment).
| >>> mask_video = [Image.new("L", frame.size, 255) for frame in video] | ||
| >>> prompt = "A cinematic mountain hike with dramatic lighting." | ||
| >>> output = pipe(prompt=prompt, video=video, mask_video=mask_video, output_type="pt").frames[0] |
There was a problem hiding this comment.
| >>> output = pipe(prompt=prompt, video=video, mask_video=mask_video, output_type="pt").frames[0] | |
| >>> output = pipe(prompt=prompt, video=video, mask_video=mask_video, output_type="np").frames[0] |
export_to_video only accepts list[np.ndarray] or list[PIL.Image.Image] as its video argument.
|
|
||
| return prompt_embeds, negative_prompt_embeds | ||
|
|
||
| def _preprocess_mask_video(self, mask_video, height: int, width: int) -> torch.Tensor: |
There was a problem hiding this comment.
| def _preprocess_mask_video(self, mask_video, height: int, width: int) -> torch.Tensor: | |
| @staticmethod | |
| def _preprocess_mask_video(mask_video, height: int, width: int) -> torch.Tensor: |
Since _preprocess_mask_video does not use the self argument, it can be made a @staticmethod.
| videos = [] | ||
| for i in range(video.size(0)): | ||
| video_bs = video[i : i + 1] | ||
| video_bs = self.vae.encode(video_bs)[0] | ||
| video_bs = video_bs.sample() | ||
| videos.append(video_bs) | ||
| video = torch.cat(videos, dim=0) |
There was a problem hiding this comment.
I think we should encode the video once instead of one batch element at a time, as this would better respect sliced/tiled encoding settings on the VAE. Users can always enable tiled encoding via pipe.vae.enable_tiling.
|
The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update. |
| if return_noise: | ||
| outputs += (noise,) | ||
|
|
||
| if return_video_latents: | ||
| outputs += (video_latents,) |
There was a problem hiding this comment.
I think we should remove the return_noise and return_video_latents arguments because these extra outputs are not used by CogVideoXFunInpaintPipeline.
| mask_pixel_values = [] | ||
| for i in range(masked_image.size(0)): | ||
| mask_pixel_value = masked_image[i].unsqueeze(0) | ||
| mask_pixel_value = self.vae.encode(mask_pixel_value)[0] | ||
| mask_pixel_value = mask_pixel_value.mode() | ||
| mask_pixel_values.append(mask_pixel_value) | ||
| masked_image_latents = torch.cat(mask_pixel_values, dim=0) |
There was a problem hiding this comment.
Similar to #13331 (comment), I think we should encode masked_image once here to respect any slicing/tiling enabled on the VAE.
| else: | ||
| masked_image_latents = None | ||
|
|
||
| return mask, masked_image_latents |
There was a problem hiding this comment.
| return mask, masked_image_latents | |
| return masked_image_latents |
I think we should remove the mask return value here as it is not used in __call__.
| if video is not None and latents is not None: | ||
| raise ValueError("Only one of `video` or `latents` should be provided.") | ||
|
|
||
| def fuse_qkv_projections(self) -> None: |
There was a problem hiding this comment.
I think we should remove the fuse_qkv_projections/unfuse_qkv_projections pipeline methods in favor of calling the corresponding methods directly on the transformer (e.g. pipe.transformer.fuse_qkv_projections()), similar to what we do for VAE slicing/tiling methods. CC @yiyixuxu
| def interrupt(self): | ||
| return self._interrupt | ||
|
|
||
| def get_timesteps(self, num_inference_steps, timesteps, strength, device): |
There was a problem hiding this comment.
I think the code would be more clear if we inlined the logic in the get_timesteps method in __call__. We generally prefer inlining smaller methods into the main pipeline code.
| width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial | ||
| num_frames = num_frames or self.transformer.config.sample_frames | ||
|
|
||
| num_videos_per_prompt = 1 |
There was a problem hiding this comment.
I think we should support num_videos_per_prompt > 1 unless there is a strong reason not to.
dg845
left a comment
There was a problem hiding this comment.
Thanks for the PR! I left an initial review :). When I run the tests locally with
`pytest tests/pipelines/cogvideo/test_cogvideox_fun_inpaint.py`, a lot of the tests fail with the following error:
FAILED tests/pipelines/cogvideo/test_cogvideox_fun_inpaint.py::CogVideoXFunInpaintPipelineFastTests::test_inference_batch_consistent - RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Can you look into this?
Added CogvideoxFunInpaintPipeline
Before submitting
documentation guidelines, and
here are tips on formatting docstrings.
Who can review?
@sayakpaul