Crop Segmentation local training error & AML training error #67

Open
Amr-MKamal opened this issue Mar 10, 2023 · 13 comments

@Amr-MKamal

Amr-MKamal commented Mar 10, 2023

I've been working my way through the crop segmentation notebook for a while, and I'm finally at the training stage. However, I get errors for both local & AML training. For local training, this is what I get in cell [20] after running trainer.fit(model, data):

AssertionError                            Traceback (most recent call last)
Cell In[20], line 7
      3     model = SegmentationModel.load_from_checkpoint(CHPT_PATH)
      4 else:
      5     # Train it now
      6     #nn.Module
----> 7     trainer.fit(model,data)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:696, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    677 r"""
    678 Runs the full optimization routine.
    679 
   (...)
    693     datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`.
    694 """
    695 self.strategy.model = model
--> 696 self._call_and_handle_interrupt(
    697     self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    698 )

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:650, in Trainer._call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
    648         return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
    649     else:
--> 650         return trainer_fn(*args, **kwargs)
    651 # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
    652 except KeyboardInterrupt as exception:

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:735, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    731 ckpt_path = ckpt_path or self.resume_from_checkpoint
    732 self._ckpt_path = self.__set_ckpt_path(
    733     ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
    734 )
--> 735 results = self._run(model, ckpt_path=self.ckpt_path)
    737 assert self.state.stopped
    738 self.training = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1166, in Trainer._run(self, model, ckpt_path)
   1162 self._checkpoint_connector.restore_training_state()
   1164 self._checkpoint_connector.resume_end()
-> 1166 results = self._run_stage()
   1168 log.detail(f"{self.__class__.__name__}: trainer tearing down")
   1169 self._teardown()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1252, in Trainer._run_stage(self)
   1250 if self.predicting:
   1251     return self._run_predict()
-> 1252 return self._run_train()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1283, in Trainer._run_train(self)
   1280 self.fit_loop.trainer = self
   1282 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1283     self.fit_loop.run()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
    198 try:
    199     self.on_advance_start(*args, **kwargs)
--> 200     self.advance(*args, **kwargs)
    201     self.on_advance_end()
    202     self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:271, in FitLoop.advance(self)
    267 self._data_fetcher.setup(
    268     dataloader, batch_to_device=partial(self.trainer._call_strategy_hook, "batch_to_device", dataloader_idx=0)
    269 )
    270 with self.trainer.profiler.profile("run_training_epoch"):
--> 271     self._outputs = self.epoch_loop.run(self._data_fetcher)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
    198 try:
    199     self.on_advance_start(*args, **kwargs)
--> 200     self.advance(*args, **kwargs)
    201     self.on_advance_end()
    202     self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py:203, in TrainingEpochLoop.advance(self, data_fetcher)
    200     self.batch_progress.increment_started()
    202     with self.trainer.profiler.profile("run_training_batch"):
--> 203         batch_output = self.batch_loop.run(kwargs)
    205 self.batch_progress.increment_processed()
    207 # update non-plateau LR schedulers
    208 # update epoch-interval ones only when we are at the end of training epoch

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
    198 try:
    199     self.on_advance_start(*args, **kwargs)
--> 200     self.advance(*args, **kwargs)
    201     self.on_advance_end()
    202     self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py:87, in TrainingBatchLoop.advance(self, kwargs)
     83 if self.trainer.lightning_module.automatic_optimization:
     84     optimizers = _get_active_optimizers(
     85         self.trainer.optimizers, self.trainer.optimizer_frequencies, kwargs.get("batch_idx", 0)
     86     )
---> 87     outputs = self.optimizer_loop.run(optimizers, kwargs)
     88 else:
     89     outputs = self.manual_loop.run(kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
    198 try:
    199     self.on_advance_start(*args, **kwargs)
--> 200     self.advance(*args, **kwargs)
    201     self.on_advance_end()
    202     self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:201, in OptimizerLoop.advance(self, optimizers, kwargs)
    198 def advance(self, optimizers: List[Tuple[int, Optimizer]], kwargs: OrderedDict) -> None:  # type: ignore[override]
    199     kwargs = self._build_kwargs(kwargs, self.optimizer_idx, self._hiddens)
--> 201     result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
    202     if result.loss is not None:
    203         # automatic optimization assumes a loss needs to be returned for extras to be considered as the batch
    204         # would be skipped otherwise
    205         self._outputs[self.optimizer_idx] = result.asdict()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:248, in OptimizerLoop._run_optimization(self, kwargs, optimizer)
    240         closure()
    242 # ------------------------------
    243 # BACKWARD PASS
    244 # ------------------------------
    245 # gradient update with accumulated gradients
    246 else:
    247     # the `batch_idx` is optional with inter-batch parallelism
--> 248     self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
    250 result = closure.consume_result()
    252 if result.loss is not None:
    253     # if no result, user decided to skip optimization
    254     # otherwise update running loss + reset accumulated loss
    255     # TODO: find proper way to handle updating running loss

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:358, in OptimizerLoop._optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    355     self.optim_progress.optimizer.step.increment_ready()
    357 # model hook
--> 358 self.trainer._call_lightning_module_hook(
    359     "optimizer_step",
    360     self.trainer.current_epoch,
    361     batch_idx,
    362     optimizer,
    363     opt_idx,
    364     train_step_and_backward_closure,
    365     on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator),
    366     using_native_amp=(self.trainer.amp_backend == AMPType.NATIVE),
    367     using_lbfgs=is_lbfgs,
    368 )
    370 if not should_accumulate:
    371     self.optim_progress.optimizer.step.increment_completed()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1550, in Trainer._call_lightning_module_hook(self, hook_name, pl_module, *args, **kwargs)
   1547 pl_module._current_fx_name = hook_name
   1549 with self.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
-> 1550     output = fn(*args, **kwargs)
   1552 # restore current_fx when nested context
   1553 pl_module._current_fx_name = prev_fx_name

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/core/module.py:1674, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
   1592 def optimizer_step(
   1593     self,
   1594     epoch: int,
   (...)
   1601     using_lbfgs: bool = False,
   1602 ) -> None:
   1603     r"""
   1604     Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls
   1605     each optimizer.
   (...)
   1672 
   1673     """
-> 1674     optimizer.step(closure=optimizer_closure)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py:168, in LightningOptimizer.step(self, closure, **kwargs)
    165     raise MisconfigurationException("When `optimizer.step(closure)` is called, the closure should be callable")
    167 assert self._strategy is not None
--> 168 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
    170 self._on_after_step()
    172 return step_output

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py:216, in Strategy.optimizer_step(self, optimizer, opt_idx, closure, model, **kwargs)
    206 """Performs the actual optimizer step.
    207 
    208 Args:
   (...)
    213     **kwargs: Any extra arguments to ``optimizer.step``
    214 """
    215 model = model or self.lightning_module
--> 216 return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:153, in PrecisionPlugin.optimizer_step(self, model, optimizer, optimizer_idx, closure, **kwargs)
    151 if isinstance(model, pl.LightningModule):
    152     closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
--> 153 return optimizer.step(closure=closure, **kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/torch/optim/lr_scheduler.py:65, in _LRScheduler.__init__.<locals>.with_counter.<locals>.wrapper(*args, **kwargs)
     63 instance._step_count += 1
     64 wrapped = func.__get__(instance, cls)
---> 65 return wrapped(*args, **kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/torch/optim/optimizer.py:113, in Optimizer._hook_for_profile.<locals>.profile_hook_step.<locals>.wrapper(*args, **kwargs)
    111 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
    112 with torch.autograd.profiler.record_function(profile_name):
--> 113     return func(*args, **kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     24 @functools.wraps(func)
     25 def decorate_context(*args, **kwargs):
     26     with self.clone():
---> 27         return func(*args, **kwargs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/torch/optim/adam.py:118, in Adam.step(self, closure)
    116 if closure is not None:
    117     with torch.enable_grad():
--> 118         loss = closure()
    120 for group in self.param_groups:
    121     params_with_grad = []

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:138, in PrecisionPlugin._wrap_closure(self, model, optimizer, optimizer_idx, closure)
    125 def _wrap_closure(
    126     self,
    127     model: "pl.LightningModule",
   (...)
    130     closure: Callable[[], Any],
    131 ) -> Any:
    132     """This double-closure allows makes sure the ``closure`` is executed before the
    133     ``on_before_optimizer_step`` hook is called.
    134 
    135     The closure (generally) runs ``backward`` so this allows inspecting gradients in this hook. This structure is
    136     consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
    137     """
--> 138     closure_result = closure()
    139     self._after_closure(model, optimizer, optimizer_idx)
    140     return closure_result

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:146, in Closure.__call__(self, *args, **kwargs)
    145 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 146     self._result = self.closure(*args, **kwargs)
    147     return self._result.loss

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:132, in Closure.closure(self, *args, **kwargs)
    131 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
--> 132     step_output = self._step_fn()
    134     if step_output.closure_loss is None:
    135         self.warning_cache.warn("`training_step` returned `None`. If this was on purpose, ignore this warning...")

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:407, in OptimizerLoop._training_step(self, kwargs)
    398 """Performs the actual train step with the tied hooks.
    399 
    400 Args:
   (...)
    404     A ``ClosureResult`` containing the training step output.
    405 """
    406 # manually capture logged metrics
--> 407 training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
    408 self.trainer.strategy.post_training_step()
    410 model_output = self.trainer._call_lightning_module_hook("training_step_end", training_step_output)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1704, in Trainer._call_strategy_hook(self, hook_name, *args, **kwargs)
   1701     return
   1703 with self.profiler.profile(f"[Strategy]{self.strategy.__class__.__name__}.{hook_name}"):
-> 1704     output = fn(*args, **kwargs)
   1706 # restore current_fx when nested context
   1707 pl_module._current_fx_name = prev_fx_name

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py:358, in Strategy.training_step(self, *args, **kwargs)
    356 with self.precision_plugin.train_step_context():
    357     assert isinstance(self.model, TrainingStep)
--> 358     return self.model.training_step(*args, **kwargs)

File ~/farmvibes-ai-main/notebooks/crop_segmentation/notebook_lib/models.py:91, in SegmentationModel.training_step(self, batch, batch_idx)
     90 def training_step(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
---> 91     return self._shared_step(batch, batch_idx)

File ~/farmvibes-ai-main/notebooks/crop_segmentation/notebook_lib/models.py:78, in SegmentationModel._shared_step(self, batch, batch_idx)
     76 pred = self(batch["image"])
     77 for t in pred, batch["mask"]:
---> 78     assert torch.all(torch.isfinite(t))
     79 loss = self.loss(pred, batch["mask"])
     81 return {"loss": loss, "preds": pred.detach(), "target": batch["mask"]}

AssertionError: 

And for the AML training, the job fails after submitting it to a compute instance. Again, it appears to have something to do with inter-package compatibility. The error message from AML:

Execution failed. User process 'python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):
  File "/mnt/azureml/cr/j/85b9fae7d3cd4fcf936315a8324fde05/exe/wd/aml_train_script.py", line 4, in <module>
    import torch
  File "/azureml-envs/azureml_a197fc75079e2e0b8afc1441914b3e27/lib/python3.10/site-packages/torch/__init__.py", line 217, in <module>
    _load_global_deps()
  File "/azureml-envs/azureml_a197fc75079e2e0b8afc1441914b3e27/lib/python3.10/site-packages/torch/__init__.py", line 178, in _load_global_deps
    _preload_cuda_deps()
  File "/azureml-envs/azureml_a197fc75079e2e0b8afc1441914b3e27/lib/python3.10/site-packages/torch/__init__.py", line 158, in _preload_cuda_deps
    ctypes.CDLL(cublas_path)
  File "/azureml-envs/azureml_a197fc75079e2e0b8afc1441914b3e27/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /azureml-envs/azureml_a197fc75079e2e0b8afc1441914b3e27/lib/python3.10/site-packages/nvidia/cublas/lib/libcublas.so.11: symbol cublasLtGetStatusString version libcublasLt.so.11 not defined in file libcublasLt.so.11 with link time reference

I could really use help moving forward from this.

@rafaspadilha rafaspadilha self-assigned this Mar 10, 2023
@rafaspadilha
Contributor

Thanks for using FarmVibes.AI and reporting the issue, @Amr-MKamal. I'll investigate this and return to you as soon as possible.

@rafaspadilha rafaspadilha added the notebooks Issues encountered while running the notebooks label Mar 14, 2023
@rafaspadilha
Contributor

@Amr-MKamal I couldn't properly reproduce your error.

Are you running it for the same region as in the notebook (within the Continental USA, where CDL is available)?
Are you using the crop_env.yaml conda environment?

The AssertionError that you are seeing is coming from the SegmentationModel._shared_step() method from notebooks/crop_segmentation/notebook_lib/models.py:

def _shared_step(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
    pred = self(batch["image"])
    for t in pred, batch["mask"]:
        assert torch.all(torch.isfinite(t))
    loss = self.loss(pred, batch["mask"])

    return {"loss": loss, "preds": pred.detach(), "target": batch["mask"]}

That assertion checks if all the values in the mask (in this case, the CDL maps) are defined and finite. This should be the case for the CDL, as the samples generated by CDLMask dataset are the result of a torch.isin operation that returns a boolean tensor.
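
If it helps narrow this down, a rough check along these lines (a hypothetical debugging sketch, assuming model and data are built as in the notebook and fit in memory on CPU) would show which of the tensors actually contains non-finite values:

import torch

# Hypothetical debugging sketch: pull one training batch and report which
# tensor (NDVI stack, CDL mask, or prediction) contains NaN/Inf values.
data.setup("fit")  # may be unnecessary if the datamodule is already set up
batch = next(iter(data.train_dataloader()))
with torch.no_grad():
    pred = model(batch["image"])

for name, t in [("image", batch["image"]), ("mask", batch["mask"]), ("pred", pred)]:
    n_bad = int((~torch.isfinite(t.float())).sum())
    print(f"{name}: shape={tuple(t.shape)}, non-finite values={n_bad}")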

@Amr-MKamal
Author

Amr-MKamal commented Mar 25, 2023

@rafaspadilha, that error was in the old version of this notebook. The error I'm getting now for local training is in cell [4] (after trying the example area for 2021-2022):

RuntimeError                              Traceback (most recent call last)
Cell In[4], line 6
      4 plt.figure(figsize=(10, 10))
      5 ax = plt.gca()
----> 6 gpd.GeoSeries([bbox_to_shapely(b) for b in data.train_dataloader().sampler]).boundary.plot(ax=ax, color="C0")
      7 gpd.GeoSeries([bbox_to_shapely(b) for b in data.val_dataloader().sampler]).boundary.plot(ax=ax, color="C1")
      8 gpd.GeoSeries(bbox_to_shapely(train_roi)).boundary.plot(ax=ax, color="black")

Cell In[4], line 6, in <listcomp>(.0)
      4 plt.figure(figsize=(10, 10))
      5 ax = plt.gca()
----> 6 gpd.GeoSeries([bbox_to_shapely(b) for b in data.train_dataloader().sampler]).boundary.plot(ax=ax, color="C0")
      7 gpd.GeoSeries([bbox_to_shapely(b) for b in data.val_dataloader().sampler]).boundary.plot(ax=ax, color="C1")
      8 gpd.GeoSeries(bbox_to_shapely(train_roi)).boundary.plot(ax=ax, color="black")

File ~/farmvibes-ai/notebooks/crop_segmentation/notebook_lib/modules.py:75, in YearRandomGeoSampler.__iter__(self)
     74 def __iter__(self) -> Iterator[BoundingBox]:
---> 75     for bbox in super().__iter__():
     76         yield year_bbox(bbox)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/torchgeo/samplers/single.py:130, in RandomGeoSampler.__iter__(self)
    123 """Return the index of a dataset.
    124 
    125 Returns:
    126     (minx, maxx, miny, maxy, mint, maxt) coordinates to index a dataset
    127 """
    128 for _ in range(len(self)):
    129     # Choose a random tile, weighted by area
--> 130     idx = torch.multinomial(self.areas, 1)
    131     hit = self.hits[idx]
    132     bounds = BoundingBox(*hit.bounds)

RuntimeError: cannot sample n_sample > prob_dist.size(-1) samples without replacement

@rafaspadilha
Contributor

rafaspadilha commented Mar 28, 2023

This error seems to happen because the RandomGeoSampler is not able to sample chips (i.e., smaller image regions that will be used as training data for the segmentation model) within the input NDVI or CDL rasters. This may happen if your input region is very small or the chip size is too big.

Are you using the same region as the notebook, or have you decreased the size of the input geometry?
If so, you may want to alter the img_size parameter of the CropSegDataModule (please refer to the class definition), for instance as sketched below.

Please let me know if that fixes your issue.
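
To be concrete, here is a rough sketch of what I mean (it reuses the constants and variables the notebook already defines, only with a smaller chip size; treat the value as a starting point, not a recommendation):

# Hypothetical sketch: same CropSegDataModule construction as in the notebook,
# with only the chip size reduced so the sampler can fit chips inside the rasters.
CHIP_SIZE = 128  # notebook default is 256

data = CropSegDataModule(
    ndvi_rasters,
    cdl_rasters,
    ndvi_stack_bands=NDVI_STACK_BANDS,
    img_size=(CHIP_SIZE, CHIP_SIZE),
    epoch_size=EPOCH_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    positive_indices=constants.CROP_INDICES,
    val_ratio=VAL_RATIO,
)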

@Amr-MKamal
Author

Amr-MKamal commented Mar 28, 2023

I used the same input_region provided in the example file, POLYGON((-118.83884470335134 46.135858606707956,-119.59905735755602 46.135858606707956,-119.59356419349352 46.759102248950285,-119.20796470232867 46.75533893062969,-118.84433786741384 46.759102248950285,-118.83884470335134 46.135858606707956)), but for the dates 2021 & 2022.

I also didn't change the chip size or the other related training parameters:

CHIP_SIZE = 256
EPOCH_SIZE = 1024
BATCH_SIZE = 16
NDVI_STACK_BANDS = 37
NUM_WORKERS = 2  # Change this depending on available memory and number of cores
VAL_RATIO = 0.2 # Ratio of the validation subset of the input region
# Training hyperparameters
LR = 1e-3  # Learning rate
WD = 1e-2  # Weight decay
MAX_EPOCHS = 10  # How many epochs to train for

I will try decreasing the chip size/img_size to 128 & tell you how that goes.

@Amr-MKamal
Author

Amr-MKamal commented Mar 30, 2023

I tried it all the way down to CHIP_SIZE = 1 and I still get the same error; minimizing these parameters alone or together doesn't solve it, @rafaspadilha.

@rafaspadilha
Contributor

I see. Please, could you check for me:

  1. How many NDVI and CDL rasters are you passing as input to CropSegDataModule? Could you run len(ndvi_rasters) and len(cdl_rasters) to check that?
  2. Did you change the positive_indices parameter of CropSegDataModule?

@Amr-MKamal
Author

The result of len(ndvi_rasters) is 330, and len(cdl_rasters) is 1.
However, I noticed from the crop segmentation documentation that both train_years & val_years default to 2020. I assumed that running a different datetime (2021) would automatically update this, but that was not the case.
After editing the CropSegDataModule call as follows, local training worked successfully:

data = CropSegDataModule(
    ndvi_rasters,
    cdl_rasters,
    ndvi_stack_bands=NDVI_STACK_BANDS,
    img_size=(CHIP_SIZE, CHIP_SIZE),
    epoch_size=EPOCH_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    positive_indices=constants.CROP_INDICES,
    val_ratio=VAL_RATIO,
    train_years=[2021],
    val_years=[2021],
)

@Amr-MKamal
Author

However, now I still get the assertion error in cell [6] :\

@rafaspadilha
Contributor

Hey, @Amr-MKamal. Yes, train_years and val_years default to 2020 in the CropSegDataModule. I will update the notebook in the next release to make it clearer that these parameters should be updated accordingly. I'm sorry for that.

Are you still getting RuntimeError: cannot sample n_sample > prob_dist.size(-1) samples without replacement?
Did you change anything from your previous run that worked successfully?

@Amr-MKamal
Author

@rafaspadilha Thank you. No, I'm getting the same assertion error I got at the beginning, in cell [6]:

| Name | Type | Params

0 | model | FPN | 23.3 M
1 | loss | BCEWithLogitsLoss | 0
2 | train_metrics | MetricCollection | 0
3 | val_metrics | MetricCollection | 0

23.3 M Trainable params
0 Non-trainable params
23.3 M Total params
93.048 Total estimated model params size (MB)

Converting CDLMask CRS from EPSG:5070 to EPSG:32611
Converting CDLMask resolution from 30.0 to 10.0

Sanity Checking DataLoader 0: 0%
0/2 [00:00<?, ?it/s]


AssertionError Traceback (most recent call last)
Cell In[6], line 6
3 model = SegmentationModel.load_from_checkpoint(CHPT_PATH)
4 else:
5 # Train it now
----> 6 trainer.fit(model, data)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:696, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
677 r"""
678 Runs the full optimization routine.
679
(...)
693 datamodule: An instance of :class:~pytorch_lightning.core.datamodule.LightningDataModule.
694 """
695 self.strategy.model = model
--> 696 self._call_and_handle_interrupt(
697 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
698 )

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:650, in Trainer._call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
648 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
649 else:
--> 650 return trainer_fn(*args, **kwargs)
651 # TODO(awaelchli): Unify both exceptions below, where KeyboardError doesn't re-raise
652 except KeyboardInterrupt as exception:

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:735, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
731 ckpt_path = ckpt_path or self.resume_from_checkpoint
732 self._ckpt_path = self.__set_ckpt_path(
733 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
734 )
--> 735 results = self._run(model, ckpt_path=self.ckpt_path)
737 assert self.state.stopped
738 self.training = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1166, in Trainer._run(self, model, ckpt_path)
1162 self._checkpoint_connector.restore_training_state()
1164 self._checkpoint_connector.resume_end()
-> 1166 results = self._run_stage()
1168 log.detail(f"{self.class.name}: trainer tearing down")
1169 self._teardown()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1252, in Trainer._run_stage(self)
1250 if self.predicting:
1251 return self._run_predict()
-> 1252 return self._run_train()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1274, in Trainer._run_train(self)
1271 self._pre_training_routine()
1273 with isolate_rng():
-> 1274 self._run_sanity_check()
1276 # enable train mode
1277 self.model.train()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1343, in Trainer._run_sanity_check(self)
1341 # run eval step
1342 with torch.no_grad():
-> 1343 val_loop.run()
1345 self._call_callback_hooks("on_sanity_check_end")
1347 # reset logger connector

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py:155, in EvaluationLoop.advance(self, *args, **kwargs)
153 if self.num_dataloaders > 1:
154 kwargs["dataloader_idx"] = dataloader_idx
--> 155 dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
157 # store batch level output per dataloader
158 self._outputs.append(dl_outputs)

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py:200, in Loop.run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py:143, in EvaluationEpochLoop.advance(self, data_fetcher, dl_max_batches, kwargs)
140 self.batch_progress.increment_started()
142 # lightning module methods
--> 143 output = self._evaluation_step(**kwargs)
144 output = self._evaluation_step_end(output)
146 self.batch_progress.increment_processed()

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py:240, in EvaluationEpochLoop._evaluation_step(self, **kwargs)
229 """The evaluation step (validation_step or test_step depending on the trainer's state).
230
231 Args:
(...)
237 the outputs of the step
238 """
239 hook_name = "test_step" if self.trainer.testing else "validation_step"
--> 240 output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
242 return output

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1704, in Trainer._call_strategy_hook(self, hook_name, *args, **kwargs)
1701 return
1703 with self.profiler.profile(f"[Strategy]{self.strategy.class.name}.{hook_name}"):
-> 1704 output = fn(*args, **kwargs)
1706 # restore current_fx when nested context
1707 pl_module._current_fx_name = prev_fx_name

File ~/anaconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py:370, in Strategy.validation_step(self, *args, **kwargs)
368 with self.precision_plugin.val_step_context():
369 assert isinstance(self.model, ValidationStep)
--> 370 return self.model.validation_step(*args, **kwargs)

File ~/farmvibes-ai/notebooks/crop_segmentation/notebook_lib/models.py:97, in SegmentationModel.validation_step(self, batch, batch_idx)
96 def validation_step(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
---> 97 return self._shared_step(batch, batch_idx)

File ~/farmvibes-ai/notebooks/crop_segmentation/notebook_lib/models.py:78, in SegmentationModel._shared_step(self, batch, batch_idx)
76 pred = self(batch["image"])
77 for t in pred, batch["mask"]:
---> 78 assert torch.all(torch.isfinite(t))
79 loss = self.loss(pred, batch["mask"])
81 return {"loss": loss, "preds": pred.detach(), "target": batch["mask"]}

AssertionError:


@Amr-MKamal
Author

@rafaspadilha As a final workaround, I went to notebook_lib/models.py and commented out this section:

# for t in pred, batch["mask"]:
#     assert torch.all(torch.isfinite(t))
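
For reference, the edited _shared_step in notebook_lib/models.py now looks roughly like this (the finiteness check is simply disabled; this is only the workaround described above, not a proper fix):

def _shared_step(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
    pred = self(batch["image"])
    # Workaround: finiteness assertion commented out, so batches containing
    # NaN/Inf values are no longer rejected before computing the loss.
    # for t in pred, batch["mask"]:
    #     assert torch.all(torch.isfinite(t))
    loss = self.loss(pred, batch["mask"])

    return {"loss": loss, "preds": pred.detach(), "target": batch["mask"]}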

With that change, the rest of the cells in the local training notebook worked successfully and I was able to save the model as an ONNX model.
However, in the 04_inference notebook, the inference workflow could not complete, failing with the following error:

RuntimeError: Failed to run op compute_onnx_from_sequence in workflow run id 3f6de9b0-45ba-43cb-a92e-04025bce9f6c for input with message id 00-3f6de9b045ba43cba92e04025bce9f6c-2c866d66b2dc8cdf-01. Error description: <class 'RuntimeError'>: Traceback (most recent call last):\n File "/opt/conda/lib/python3.8/site-packages/vibe_agent/worker.py", line 123, in run_op\n return factory.build(spec).run(input, cache_info)\n File "/opt/conda/lib/python3.8/site-packages/vibe_agent/ops.py", line 106, in run\n stac_results = self._call_validate_op(**{**items, **raw_items})\n File "/opt/conda/lib/python3.8/site-packages/vibe_agent/ops.py", line 72, in _call_validate_op\n results = self.callback(**kwargs)\n File "/app/ops/compute_onnx/compute_onnx.py", line 65, in compute_onnx\n model = ort.InferenceSession(model_path)\n File "/opt/conda/lib/python3.8/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 347, in init\n self._create_inference_session(providers, provider_options, disabled_optimizers)\n File "/opt/conda/lib/python3.8/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 384, in _create_inference_session\n sess = C.InferenceSession(session_options, self._model_path, True, self._read_config_from_model)\nonnxruntime.capi.onnxruntime_pybind11_state.InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from /mnt/onnx_resources/ failed:Protobuf parsing failed.\n.

Note that I get this error running the provided example area & date (2020) with the provided environment.
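
In case it helps isolate where things break, a quick local sanity check along these lines (hypothetical file name for wherever the exported model was saved) would tell whether the exported ONNX file itself is malformed or whether the problem is on the workflow side:

import onnx
import onnxruntime as ort

MODEL_PATH = "crop_seg.onnx"  # hypothetical path to the exported model file

# Raises if the serialized protobuf cannot be parsed (same failure mode as above)
onnx.checker.check_model(onnx.load(MODEL_PATH))

# Loading a session locally mirrors what compute_onnx does inside the workflow
sess = ort.InferenceSession(MODEL_PATH)
print([i.name for i in sess.get_inputs()], [o.name for o in sess.get_outputs()])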

@tarishijain

I was getting a similar error at the training stage after running trainer.fit(model, data). Also, I was working with a reduced dataset of 6 months rather than a year. Will the above solution work here, and is it advisable to train the model for less than a year?
I have used the same region and the provided crop_env.yaml conda environment.


AssertionError Traceback (most recent call last)
/tmp/ipykernel_294729/3723144614.py in
4 else:
5 # Train it now
----> 6 trainer.fit(model, data)

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
694 """
695 self.strategy.model = model
--> 696 self._call_and_handle_interrupt(
697 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
698 )

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
648 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
649 else:
--> 650 return trainer_fn(*args, **kwargs)
651 # TODO(awaelchli): Unify both exceptions below, where KeyboardError doesn't re-raise
652 except KeyboardInterrupt as exception:

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
733 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
734 )
--> 735 results = self._run(model, ckpt_path=self.ckpt_path)
736
737 assert self.state.stopped

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1164 self._checkpoint_connector.resume_end()
1165
-> 1166 results = self._run_stage()
1167
1168 log.detail(f"{self.class.name}: trainer tearing down")

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self)
1250 if self.predicting:
1251 return self._run_predict()
-> 1252 return self._run_train()
1253
1254 def _pre_training_routine(self):

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
1272
1273 with isolate_rng():
-> 1274 self._run_sanity_check()
1275
1276 # enable train mode

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run_sanity_check(self)
1341 # run eval step
1342 with torch.no_grad():
-> 1343 val_loop.run()
1344
1345 self._call_callback_hooks("on_sanity_check_end")

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py in run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py in advance(self, *args, **kwargs)
153 if self.num_dataloaders > 1:
154 kwargs["dataloader_idx"] = dataloader_idx
--> 155 dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
156
157 # store batch level output per dataloader

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py in run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in advance(self, data_fetcher, dl_max_batches, kwargs)
141
142 # lightning module methods
--> 143 output = self._evaluation_step(**kwargs)
144 output = self._evaluation_step_end(output)
145

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in _evaluation_step(self, **kwargs)
238 """
239 hook_name = "test_step" if self.trainer.testing else "validation_step"
--> 240 output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
241
242 return output

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _call_strategy_hook(self, hook_name, *args, **kwargs)
1702
1703 with self.profiler.profile(f"[Strategy]{self.strategy.class.name}.{hook_name}"):
-> 1704 output = fn(*args, **kwargs)
1705
1706 # restore current_fx when nested context

~/miniconda3/envs/crop-seg/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py in validation_step(self, *args, **kwargs)
368 with self.precision_plugin.val_step_context():
369 assert isinstance(self.model, ValidationStep)
--> 370 return self.model.validation_step(*args, **kwargs)
371
372 def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:

~/farmvibes-ai/notebooks/crop_segmentation/notebook_lib/models.py in validation_step(self, batch, batch_idx)
95
96 def validation_step(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
---> 97 return self._shared_step(batch, batch_idx)
98
99 def validation_step_end(self, outputs: Dict[str, Any]) -> None:

~/farmvibes-ai/notebooks/crop_segmentation/notebook_lib/models.py in _shared_step(self, batch, batch_idx)
76 pred = self(batch["image"])
77 for t in pred, batch["mask"]:
---> 78 assert torch.all(torch.isfinite(t))
79 loss = self.loss(pred, batch["mask"])
80

AssertionError:
