import Button from '@material-ui/core/Button';
import Divider from '@material-ui/core/Divider';
import Grid from '@material-ui/core/Grid';
import Tooltip from '@material-ui/core/Tooltip';
import Paper from '@material-ui/core/Paper';
import Typography from '@material-ui/core/Typography';
import Link from '@material-ui/core/Link';
import FileCopyOutlinedIcon from '@material-ui/icons/FileCopyOutlined';
import IconButton from '@material-ui/core/IconButton';
import { withStyles } from '@material-ui/core/styles';
import * as React from 'react';

import SideNavPub from '../../components/SideNavPub';
import { Styles, styles } from './OctMAEContainer.styles';

/** One author entry rendered in the byline under the paper title. */
interface Author {
  /** Display name. */
  readonly name: string;
  /** Personal or institutional home page the name links to. */
  readonly href: string;
  /** Superscript affiliation markers, matching the numbered affiliation list. */
  readonly affiliations: string;
}

/** Authors in publication order. */
const AUTHORS: ReadonlyArray<Author> = [
  { name: 'Shun Iwase', href: 'https://www.sh8.io', affiliations: '1,2' },
  { name: 'Katherine Liu', href: 'https://www.thekatherineliu.com/', affiliations: '2' },
  { name: 'Vitor Guizilini', href: 'https://vitorguizilini.weebly.com/', affiliations: '2' },
  { name: 'Adrien Gaidon', href: 'https://adriengaidon.com/', affiliations: '2' },
  { name: 'Kris Kitani', href: 'https://kriskitani.github.io/', affiliations: '1' },
  { name: 'Rareș Ambruș', href: 'https://www.tri.global/about-us/dr-rares-ambrus', affiliations: '2' },
  { name: 'Sergey Zakharov', href: 'https://zakharos.github.io/', affiliations: '2' },
];

/** U+2003 EM SPACE (the character `&emsp;` decodes to), used between author names. */
const EM_SPACE = '\u2003';

/**
 * BibTeX entry shown in the "Citation" section and written to the clipboard
 * by the copy button. Author names are separated by a bare `and`.
 */
const CITATION_BIBTEX = `@InProceedings{Iwase_ECCV_2024,
  author = {Iwase, Shun and Liu, Katherine and Guizilini, Vitor and Gaidon, Adrien and Kitani, Kris and Ambruș, Rareș and Zakharov, Sergey},
  title = {Zero-Shot Multi-Object Scene Completion},
  booktitle = {ECCV},
  year = {2024}
}`;

/**
 * Project page for the paper "Zero-Shot Multi-Object Scene Completion".
 *
 * Renders the side navigation, the title/author/affiliation header, links to
 * the paper, code and video, the descriptive sections with their figures, the
 * embedded presentation video and a copyable BibTeX citation. Styling comes
 * from the `classes` prop.
 */
class OctMAEContainer extends React.Component<Styles> {

  /** Sets the browser tab title once the page is mounted. */
  componentDidMount(): void {
    document.title = 'Zero-Shot Multi-Object Scene Completion';
  }

  /**
   * Copies the BibTeX citation to the system clipboard.
   *
   * Copying is best-effort: the async Clipboard API is only exposed in secure
   * contexts, and the write can be rejected (e.g. permission denied), so both
   * cases are handled instead of throwing from the click handler or leaving an
   * unhandled promise rejection.
   */
  private handleCopyCitation = (): void => {
    if (!navigator.clipboard) {
      console.warn('Clipboard API is unavailable; the citation was not copied.');
      return;
    }

    navigator.clipboard.writeText(CITATION_BIBTEX).catch((error: unknown) => {
      console.warn('Failed to copy the citation to the clipboard.', error);
    });
  }

  /** Renders the whole publication page. */
  render(): React.ReactNode {
    const { classes } = this.props;

    return (
      <Paper elevation={0}>
        <Grid item xs container className={classes.paperContainer}>
          <Grid item md={12} lg={12} xl={12} className={classes.sideNavContainer}>
            <SideNavPub />
          </Grid>
          <Grid item md={8} lg={6} xl={4} className={classes.paperContextWrapper}>
            <div className={classes.paperContext}>
              <Typography className={classes.paperTitle}>
                Zero-Shot Multi-Object Scene Completion
              </Typography>
              <Typography className={classes.authorName}>
                {AUTHORS.map((author, index) => (
                  <React.Fragment key={author.href}>
                    {/* Separator goes between names only, so no em space trails the last author. */}
                    {index > 0 ? EM_SPACE : null}
                    <Link
                      className={classes.authorLink}
                      href={author.href}
                      target="_blank"
                      rel="noopener noreferrer"
                    >
                      {author.name}<sup>{author.affiliations}</sup>
                    </Link>
                  </React.Fragment>
                ))}
              </Typography>
              <Typography className={classes.affiliationName}>
                <span className={classes.affiliation}><sup>1</sup>Carnegie Mellon University</span>
                <span className={classes.affiliation}><sup>2</sup>Toyota Research Institute</span>
              </Typography>
              <Typography className={classes.conferenceName}>
                <i>ECCV 2024</i>
              </Typography>
              {/* rel="noopener noreferrer" keeps pages opened in a new tab from reaching window.opener. */}
              <Link
                href="https://arxiv.org/abs/2403.14628"
                className={classes.linkStyle}
                target="_blank"
                rel="noopener noreferrer"
              >
                <Button variant="outlined" className={classes.buttonPaperStyle} color="primary">
                  Paper
                </Button>
              </Link>
              <Link
                href="https://github.com/TRI-ML/OctMAE"
                className={classes.linkStyle}
                target="_blank"
                rel="noopener noreferrer"
              >
                <Button variant="outlined" className={classes.buttonStyle} color="secondary">
                  Code&Dataset
                </Button>
              </Link>
              <Link
                href="https://youtu.be/DTEU3coeFlU"
                className={classes.linkStyle}
                target="_blank"
                rel="noopener noreferrer"
              >
                <Button variant="outlined" className={classes.buttonVideoStyle} color="primary">
                  Full Presentation
                </Button>
              </Link>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Abstract
              </Typography>
              <Grid item>
                <Typography className={classes.content}>
                We present a 3D scene completion method that recovers the complete geometry of multiple unseen objects in complex scenes from a single RGB-D image. Despite notable advancements in single-object 3D shape completion, high-quality reconstructions in highly cluttered real-world multi-object scenes remains a challenge. To address this issue, we propose OctMAE, an architecture that leverages an Octree U-Net and a latent 3D MAE to achieve high-quality and near real-time multi-object scene completion through both local and global geometric reasoning. Because a naive 3D MAE can be computationally intractable and memory intensive even in the latent space, we introduce a novel occlusion masking strategy and adopt 3D rotary embeddings, which significantly improves the runtime and scene completion quality. To generalize to a wide range of objects in diverse scenes, we create a large-scale photorealistic dataset, featuring a diverse set of 12K 3D object models from the Objaverse dataset which are rendered in multi-object scenes with physics-based positioning. Our method outperforms the current state-of-the-art on both synthetic and real-world datasets and demonstrates a strong zero-shot capability.
                </Typography>
              </Grid>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Overview
              </Typography>
              <Typography className={classes.content}>
              Our goal is to reconstruct multiple novel objects from a single RGB-D image fast and accurately without time-consuming test-time optimization. Toward this goal, we aim to develop a shape completion framework which can better approximate the shape distribution of diverse and large-scale 3D objects.
              </Typography>
              <Grid item>
                <img
                  className={classes.imageStyle}
                  alt='high_overview.img'
                  src={require('../../assets/oct_mae/high_overview.png')}
                />
              </Grid>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Network Architecture
              </Typography>
              <Typography className={classes.content}>
              To this end, we propose OctMAE, a hybrid architecture of Octree U-Net and masked autoencoders (MAE). This figure illustrates the network architecture of our proposed method. Our key insight is latent 3D MAE can capture global structures better than CNNs. We are going to demonstrate the strong performance and generalization of our methods through 3D visualizations after describing each component step by step. Octree feature aggregation is fairly simple. First, a given RGB image is encoded by the pre-trained image encoder E such as ResNext. Next, the octree feature F is obtained by lifting up 2D image features from the 2D to 3D space using a depth map D and camera intrinsics. In OctMAE, the latent octree feature of the visible surfaces is computed by Octree encoder and latent 3D MAE encoder. Next, the mask tokens are added to the encoded latent feature and they are decoded together using the latent 3D MAE decoder. The key idea is that we only place the tokens in occluded regions. Although it’s a simple modification, the proposed occlusion masking significantly boost the performance and runtime of the latent 3D MAE, compared to dense masking. Finally, the Octree decoder predicts the completed 3D shape at each LoD. For memory and computational efficiency, we get rid of voxels predicted as empty while upsampling the features.
              </Typography>
              <Grid item>
                <img
                  className={classes.imageStyle}
                  alt='network_architecture.img'
                  src={require('../../assets/oct_mae/network_architecture.png')}
                />
              </Grid>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Dataset
              </Typography>
              <Typography className={classes.content}>
              To train the baselines and the proposed network, we create a new diverse and large-scale synthetic dataset with 1M RGB-D images rendered by BlenderProc. The objects are selected from 601 LVIS categories in Objaverse dataset.
              </Typography>
              <Grid item>
                <img
                  className={classes.imageStyle}
                  alt='dataset.img'
                  src={require('../../assets/oct_mae/dataset.png')}
                />
              </Grid>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Results
              </Typography>
              <Typography className={classes.content}>
              We show the results on real-world datasets. Unlike the synthetic dataset, the real-world depth measurements are more noisy and erroneous, however, we observe that our method can generate faithful and consistent 3D shapes on different types of objects. Moreover, you can find that our method shows a strong zero-shot capability even for unique shape objects.
              </Typography>
              <Grid item>
                <img
                  className={classes.imageStyle}
                  alt='results_hb.img'
                  src={require('../../assets/oct_mae/results_hb.png')}
                />
              </Grid>
              <Grid item>
                <img
                  className={classes.imageStyle}
                  alt='results_hope.img'
                  src={require('../../assets/oct_mae/results_hope.png')}
                />
              </Grid>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Full Presentation
              </Typography>
              <div className={classes.videoContainerStyle}>
                <Grid item>
                  {/* A title gives the frame an accessible name for assistive technology. */}
                  <iframe
                    className={classes.videoIframe}
                    src="https://www.youtube.com/embed/DTEU3coeFlU"
                    title="Zero-Shot Multi-Object Scene Completion - full presentation video"
                    allowFullScreen
                  ></iframe>
                </Grid>
              </div>
              <Divider className={classes.dividerStyle} />
              <Typography className={classes.subtitle}>
                Citation
              </Typography>
              <div className={classes.citationContainerStyle}>
                <Tooltip title="Copy" placement="left" arrow>
                  <IconButton
                    className={classes.citationCopyButtonStyle}
                    onClick={this.handleCopyCitation}
                    aria-label="Copy citation"
                  >
                    <FileCopyOutlinedIcon/>
                  </IconButton>
                </Tooltip>
                <div className={classes.citationStyle}>
                  <pre>
                    {CITATION_BIBTEX}
                  </pre>
                </div>
              </div>
            </div>
          </Grid>
        </Grid>
      </Paper>
    );
  }
}

// Default export: the page component wrapped in Material-UI's `withStyles`
// higher-order component, using the `styles` imported from
// ./OctMAEContainer.styles to supply the `classes` prop read in render.
export default withStyles(styles)(OctMAEContainer);
