<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<!-- Begin Jekyll SEO tag v2.5.0 -->
<title>Segmentation | Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</title>
<meta name="generator" content="Jekyll v3.7.4" />
<meta property="og:title" content="Segmentation" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<meta property="og:description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<link rel="canonical" href="http://boschresearch.github.io/multimodalperception/segmentation.html" />
<meta property="og:url" content="http://boschresearch.github.io/multimodalperception/segmentation.html" />
<meta property="og:site_name" content="Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges" />
<script type="application/ld+json">
{"url":"http://boschresearch.github.io/multimodalperception/segmentation.html","headline":"Segmentation","description":"Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally","@type":"WebPage","@context":"http://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#FF4747">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<link rel="stylesheet" href="assets/css/style.css?v=">
</head>
<body>
<a id="skip-to-content" href="#content">Skip to the content.</a>
<header class="page-header" role="banner">
<h3 class="project-name">Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</h3>
<h4 class="project-tagline">Di Feng*, Christian Haase-Schuetz*, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally</h4>
</header>
<main id="content" class="main-content" role="main">
<h1 id="segmentation">Segmentation</h1>
<p><a id="bck" href="index.html#introtab"><b>Back to index</b></a>
<a href="segmentation/segmentation_2d.html#bck"><img src="img/2D.png" alt="2D" width="50" /></a>
<a href="segmentation/segmentation_3d.html#bck"><img src="img/3D.png" alt="3D" width="50" /></a>
<a href="segmentation/segmentation_thermal.html#bck"><img src="img/Thermal.png" alt="Thermal" width="50" /></a>
<a href="segmentation/segmentation_lidar.html#bck"><img src="img/LiDAR.png" alt="LiDAR" width="50" /></a></p>
<table id="commontab">
<tr><th id="segmentation"> Reference </th><th id="segmentation"> Sensors </th><th id="segmentation"> Semantics </th><th id="segmentation"> Sensing Modality Representations </th><th id="segmentation"> Fusion Operation and Method </th><th id="segmentation"> Fusion Level </th><th> Dataset(s) used </th></tr>
<tr><td valign="top">Chen <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1904.01206">[pdf]</a><a href="./ref/chen2019progressive.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> Road segmentation </td><td valign="top"> RGB image, altitude difference image. Each processed by a CNN </td><td valign="top"> Feature adaptation module, modified concatenation. </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top">Valada <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1808.03833">[pdf]</a><a href="./ref/valada2018self.bib">[ref]</a>
</td><td valign="top"> Visual camera, depth camera, thermal camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image, thermal image, depth image. Each processed by FCN with ResNet backbone (Adapnet++ architecture) </td><td valign="top"> Extension of Mixture of Experts </td><td valign="top"> Middle </td><td valign="top"> Six datasets, including Cityscape, Sun RGB-D, etc. </td></tr>
<tr><td valign="top">Sun <i>et al.</i>, 2019
<a href="https://ieeexplore.ieee.org/abstract/document/8666745">[pdf]</a><a href="./ref/sun2019rtfnet.bib">[ref]</a>
</td><td valign="top"> Visual camera, thermal camera </td><td valign="top"> Multiple 2D objects in campus environments </td><td valign="top"> RGB image, thermal image. Each processed by a base network built on ResNet </td><td valign="top"> Element-wise summation in the encoder networks </td><td valign="top"> Middle </td><td valign="top"> Datasets published by <a href="../ref/ha2017mfnet.bib">[ref]</a> </td></tr>
<tr><td valign="top"> Caltagirone <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1809.07941">[pdf]</a><a href="./ref/caltagirone2019lidar.bib">[ref]</a>
</td><td valign="top"> LiDAR, vision camera </td><td valign="top"> Road segmentation </td><td valign="top"> LiDAR front-view depth images, RGB image. Each input processed by a FCN </td><td valign="top"> Feature concatenation (For early and late fusion), weighted addition similar to gating network (for middle-level cross fusion) </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top">Erkent <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8593434">[pdf]</a><a href="./ref/erkent2018semantic.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> LiDAR BEV occupancy grids (processed based on Bayesian filtering and tracking), RGB image (processed by a FCN with VGG16 backbone) </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle </td><td valign="top"> KITTI, self-recorded </td></tr>
<tr><td valign="top"> Lv <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8500551/">[pdf]</a><a href="./ref/lv2018novel.bib">[ref]</a>
</td><td valign="top"> LiDAR, vision camera </td><td valign="top"> Road segmentation </td><td valign="top"> LiDAR BEV maps, RGB image. Each input processed by a FCN with dilated convolution operator. RGB image features are alo projected onto LiDAR BEV plane before fusion </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Wulff <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8500549">[pdf]</a><a href="./ref/wulff2018early.bib">[ref]</a>
</td><td valign="top"> LiDAR, vision camera </td><td valign="top"> Road segmentation. Alternatives: freespace, ego-lane detection </td><td valign="top"> LiDAR BEV maps, RGB image projected onto BEV plane. Inputs processed by a FCN with UNet </td><td valign="top"> Feature concatenation </td><td valign="top"> Early </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Kim <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1807.06233">[pdf]</a><a href="./ref/kim2018robust.bib">[ref]</a>
</td><td valign="top"> LiDAR, vision camera </td><td valign="top"> 2D Off-road terrains </td><td valign="top"> LiDAR voxel (processed by 3D convolution), RGB image (processed by ENet) </td><td valign="top"> Addition </td><td valign="top"> Early, Middle, Late </td><td valign="top"> self-recorded </td></tr>
<tr><td valign="top"> Guan <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1802.09972">[pdf]</a><a href="./ref/guan2018fusion.bib">[ref]</a>
</td><td valign="top"> Vision camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by a base network built on VGG16 </td><td valign="top"> Feature concatenation, Mixture of Experts </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
<tr><td valign="top"> Yang <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8428696/">[pdf]</a><a href="./ref/yang2018fusion.bib">[ref]</a>
</td><td valign="top"> LiDAR, vision camera </td><td valign="top"> Road segmentation </td><td valign="top"> LiDAR points (processed by PointNet++), RGB image (processed by FCN with VGG16 backbone) </td><td valign="top"> Optimizing Conditional Random Field (CRF) </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top">Gu <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8370690">[pdf]</a><a href="./ref/gu20183d.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> Road segmentation </td><td valign="top"> LiDAR front-view depth and height maps (processed by a inverse-depth histogram based line scanning strategy), RGB image (processed by a FCN). </td><td valign="top"> Optimizing Conditional Random Field </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top">Cai <i>et al.</i>, 2018
<a href="https://www.mdpi.com/1424-8220/18/12/4158">[pdf]</a><a href="./ref/cai2018robust.bib">[ref]</a>
</td><td valign="top"> Satellite map with route information, visual camera </td><td valign="top"> Road segmentation </td><td valign="top"> Route map image, RGB image. Images are fused and processed by a FCN </td><td valign="top"> Overlaying the line and curve segments in the route map onto the RGB image to generate the Map Fusion Image (MFI) </td><td valign="top"> Early </td><td valign="top"> self-recorded data </td></tr>
<tr><td valign="top"> Ha <i>et al.</i>, 2017
<a href="https://ieeexplore.ieee.org/abstract/document/8206396/">[pdf]</a><a href="./ref/ha2017mfnet.bib">[ref]</a>
</td><td valign="top"> Vision camera, thermal camera </td><td valign="top"> Multiple 2D objects in campus environments </td><td valign="top"> RGB image, thermal image. Each processed by a FCN and mini-inception block </td><td valign="top"> Feature concatenation, addition (``short-cut fusion'') </td><td valign="top"> Middle </td><td valign="top"> self-recorded data </td></tr>
<tr><td valign="top"> Valada <i>et al.</i>, 2017
<a href="http://ais.informatik.uni-freiburg.de/publications/papers/valada17icraa.pdf">[pdf]</a><a href="./ref/valada2017adapnet.bib">[ref]</a>
</td><td valign="top"> Vision camera, thermal camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image, thermal image, depth image. Each processed by FCN with ResNet backbone </td><td valign="top"> Mixture of Experts </td><td valign="top"> Late </td><td valign="top"> Cityscape, Freiburg Multispectral Dataset, Synthia </td></tr>
<tr><td valign="top"> Schneider <i>et al.</i>, 2017
<a href="https://arxiv.org/pdf/1707.03167">[pdf]</a><a href="./ref/schneider2017regnet.bib">[ref]</a>
</td><td valign="top"> Vision camera </td><td valign="top"> Multiple 2D Objects </td><td valign="top"> RGB image, depth image </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> Cityscape </td></tr>
<tr><td valign="top"> Schneider <i>et al.</i>, 2017
<a href="https://arxiv.org/pdf/1707.03167">[pdf]</a><a href="./ref/schneider2017regnet.bib">[ref]</a>
</td><td valign="top"> Vision camera </td><td valign="top"> Multiple 2D Objects </td><td valign="top"> RGB image (processed by GoogLeNet), depth image from stereo camera (processed by NiN net) </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> Cityscape </td></tr>
<tr><td valign="top"> Valada <i>et al.</i>, 2016
<a href="https://link.springer.com/chapter/10.1007/978-3-319-50115-4_41">[pdf]</a><a href="./ref/valada2016deep.bib">[ref]</a>
</td><td valign="top"> Vision camera, thermal camera </td><td valign="top"> Multiple 2D objects in forested environments </td><td valign="top"> RGB image, thermal image, depth image. Each processed by the UpNet (built on VGG16 and up-convolution) </td><td valign="top"> Feature concatenation, addition </td><td valign="top"> Early, Late </td><td valign="top"> self-recorded data </td></tr>
</table>
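<p> The &ldquo;Fusion Operation and Method&rdquo; column above reduces to a few recurring tensor operations. The following is a minimal PyTorch sketch, not taken from any of the cited papers (tensor shapes, layer sizes, and variable names are illustrative assumptions), of the three most common ones: feature concatenation, element-wise addition, and Mixture-of-Experts-style gated weighting. </p>
<pre><code>import torch
import torch.nn as nn

# Feature maps from two modality-specific encoders (e.g. an RGB branch and
# a LiDAR or thermal branch), assumed to share spatial resolution.
rgb_feat   = torch.randn(1, 64, 32, 32)
other_feat = torch.randn(1, 64, 32, 32)

# 1) Feature concatenation (e.g. Erkent et al. 2018, Lv et al. 2018):
#    stack along the channel axis, then let a 1x1 conv mix the channels.
reduce = nn.Conv2d(128, 64, kernel_size=1)
fused_concat = reduce(torch.cat([rgb_feat, other_feat], dim=1))

# 2) Element-wise addition (e.g. Sun et al. 2019): shapes must match.
fused_add = rgb_feat + other_feat

# 3) Gated weighting in the spirit of Mixture of Experts (e.g. Valada et
#    al. 2017) or middle-level cross fusion (Caltagirone et al. 2019):
#    a small network predicts per-pixel modality weights that sum to one.
gate = nn.Sequential(nn.Conv2d(128, 2, kernel_size=1), nn.Softmax(dim=1))
w = gate(torch.cat([rgb_feat, other_feat], dim=1))      # (1, 2, 32, 32)
fused_gated = w[:, 0:1] * rgb_feat + w[:, 1:2] * other_feat
</code></pre>
<p> Whether such a block counts as early, middle, or late fusion depends only on where in the two encoder networks the fused features are taken from, which is what the &ldquo;Fusion Level&rdquo; column distinguishes. </p>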
<footer class="site-footer">
<span class="site-footer-credits">
(c) Robert Bosch GmbH 2019. All rights reserved. <a href="http://www.bosch.com/research">www.bosch.com/research</a>.
<p></p>
This page was generated by <a href="https://pages.github.com">GitHub Pages</a>.</span>
</footer>
</main>
</body>
</html>