Skip to content

Commit c9f1c57

Browse files
hsato03Henrique Sato
and
Henrique Sato
authored
Fix snapshot scheduling with expired jobs (#8832)
Co-authored-by: Henrique Sato <[email protected]>
1 parent 1e12a80 commit c9f1c57

File tree

6 files changed

+113
-93
lines changed

6 files changed

+113
-93
lines changed

engine/schema/src/main/java/com/cloud/storage/SnapshotScheduleVO.java

+9
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
import javax.persistence.TemporalType;
3030

3131
import com.cloud.storage.snapshot.SnapshotSchedule;
32+
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
33+
import org.apache.commons.lang3.builder.ToStringStyle;
3234

3335
@Entity
3436
@Table(name = "snapshot_schedule")
@@ -132,4 +134,11 @@ public String getUuid() {
132134
public void setUuid(String uuid) {
133135
this.uuid = uuid;
134136
}
137+
138+
@Override
139+
public String toString() {
140+
ReflectionToStringBuilder reflectionToStringBuilder = new ReflectionToStringBuilder(this, ToStringStyle.JSON_STYLE);
141+
reflectionToStringBuilder.setExcludeFieldNames("id");
142+
return reflectionToStringBuilder.toString();
143+
}
135144
}

engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDao.java

+2-4
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,11 @@
2727
*/
2828
public interface SnapshotScheduleDao extends GenericDao<SnapshotScheduleVO, Long> {
2929

30-
List<SnapshotScheduleVO> getCoincidingSnapshotSchedules(long volumeId, Date date);
31-
3230
List<SnapshotScheduleVO> getSchedulesToExecute(Date currentTimestamp);
3331

34-
SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing);
32+
List<SnapshotScheduleVO> getSchedulesAssignedWithAsyncJob();
3533

36-
SnapshotScheduleVO findOneByVolume(long volumeId);
34+
SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing);
3735

3836
SnapshotScheduleVO findOneByVolumePolicy(long volumeId, long policyId);
3937

engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDaoImpl.java

+9-26
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
public class SnapshotScheduleDaoImpl extends GenericDaoBase<SnapshotScheduleVO, Long> implements SnapshotScheduleDao {
3333
protected final SearchBuilder<SnapshotScheduleVO> executableSchedulesSearch;
3434
protected final SearchBuilder<SnapshotScheduleVO> coincidingSchedulesSearch;
35-
private final SearchBuilder<SnapshotScheduleVO> VolumeIdSearch;
35+
protected final SearchBuilder<SnapshotScheduleVO> schedulesAssignedWithAsyncJob;
3636
private final SearchBuilder<SnapshotScheduleVO> VolumeIdPolicyIdSearch;
3737

3838
protected SnapshotScheduleDaoImpl() {
@@ -48,36 +48,14 @@ protected SnapshotScheduleDaoImpl() {
4848
coincidingSchedulesSearch.and("asyncJobId", coincidingSchedulesSearch.entity().getAsyncJobId(), SearchCriteria.Op.NULL);
4949
coincidingSchedulesSearch.done();
5050

51-
VolumeIdSearch = createSearchBuilder();
52-
VolumeIdSearch.and("volumeId", VolumeIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ);
53-
VolumeIdSearch.done();
54-
5551
VolumeIdPolicyIdSearch = createSearchBuilder();
5652
VolumeIdPolicyIdSearch.and("volumeId", VolumeIdPolicyIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ);
5753
VolumeIdPolicyIdSearch.and("policyId", VolumeIdPolicyIdSearch.entity().getPolicyId(), SearchCriteria.Op.EQ);
5854
VolumeIdPolicyIdSearch.done();
5955

60-
}
61-
62-
/**
63-
* {@inheritDoc}
64-
*/
65-
@Override
66-
public List<SnapshotScheduleVO> getCoincidingSnapshotSchedules(long volumeId, Date date) {
67-
SearchCriteria<SnapshotScheduleVO> sc = coincidingSchedulesSearch.create();
68-
sc.setParameters("volumeId", volumeId);
69-
sc.setParameters("scheduledTimestamp", date);
70-
// Don't return manual snapshots. They will be executed through another
71-
// code path.
72-
sc.addAnd("policyId", SearchCriteria.Op.NEQ, 1L);
73-
return listBy(sc);
74-
}
75-
76-
@Override
77-
public SnapshotScheduleVO findOneByVolume(long volumeId) {
78-
SearchCriteria<SnapshotScheduleVO> sc = VolumeIdSearch.create();
79-
sc.setParameters("volumeId", volumeId);
80-
return findOneBy(sc);
56+
schedulesAssignedWithAsyncJob = createSearchBuilder();
57+
schedulesAssignedWithAsyncJob.and("asyncJobId", schedulesAssignedWithAsyncJob.entity().getAsyncJobId(), SearchCriteria.Op.NNULL);
58+
schedulesAssignedWithAsyncJob.done();
8159
}
8260

8361
@Override
@@ -98,6 +76,11 @@ public List<SnapshotScheduleVO> getSchedulesToExecute(Date currentTimestamp) {
9876
return listBy(sc);
9977
}
10078

79+
@Override
80+
public List<SnapshotScheduleVO> getSchedulesAssignedWithAsyncJob() {
81+
return listBy(schedulesAssignedWithAsyncJob.create());
82+
}
83+
10184
/**
10285
* {@inheritDoc}
10386
*/

engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql

+3
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ CREATE TABLE IF NOT EXISTS `cloud_usage`.`quota_email_configuration`(
9090
CONSTRAINT `FK_quota_email_configuration_account_id` FOREIGN KEY (`account_id`) REFERENCES `cloud_usage`.`quota_account`(`account_id`),
9191
CONSTRAINT `FK_quota_email_configuration_email_template_id` FOREIGN KEY (`email_template_id`) REFERENCES `cloud_usage`.`quota_email_templates`(`id`));
9292

93+
-- Remove on delete cascade from snapshot schedule
94+
ALTER TABLE `cloud`.`snapshot_schedule` DROP CONSTRAINT `fk__snapshot_schedule_async_job_id`;
95+
9396
-- Add `is_implicit` column to `host_tags` table
9497
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.host_tags', 'is_implicit', 'int(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT "If host tag is implicit or explicit" ');
9598

server/src/main/java/com/cloud/storage/snapshot/SnapshotSchedulerImpl.java

+31-63
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.apache.cloudstack.framework.jobs.AsyncJobManager;
3636
import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao;
3737
import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO;
38+
import org.apache.cloudstack.jobs.JobInfo;
3839
import org.apache.cloudstack.managed.context.ManagedContextTimerTask;
3940
import org.springframework.stereotype.Component;
4041

@@ -47,7 +48,6 @@
4748
import com.cloud.storage.Snapshot;
4849
import com.cloud.storage.SnapshotPolicyVO;
4950
import com.cloud.storage.SnapshotScheduleVO;
50-
import com.cloud.storage.SnapshotVO;
5151
import com.cloud.storage.VolumeVO;
5252
import com.cloud.storage.dao.SnapshotDao;
5353
import com.cloud.storage.dao.SnapshotPolicyDao;
@@ -64,7 +64,6 @@
6464
import com.cloud.utils.concurrency.TestClock;
6565
import com.cloud.utils.db.DB;
6666
import com.cloud.utils.db.GlobalLock;
67-
import com.cloud.utils.db.SearchCriteria;
6867
import com.cloud.utils.db.TransactionLegacy;
6968
import com.cloud.vm.snapshot.VMSnapshotManager;
7069
import com.cloud.vm.snapshot.VMSnapshotVO;
@@ -144,7 +143,7 @@ public void poll(final Date currentTimestamp) {
144143
try {
145144
if (scanLock.lock(ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION)) {
146145
try {
147-
checkStatusOfCurrentlyExecutingSnapshots();
146+
scheduleNextSnapshotJobsIfNecessary();
148147
} finally {
149148
scanLock.unlock();
150149
}
@@ -174,68 +173,37 @@ public void poll(final Date currentTimestamp) {
174173
}
175174
}
176175

177-
private void checkStatusOfCurrentlyExecutingSnapshots() {
178-
final SearchCriteria<SnapshotScheduleVO> sc = _snapshotScheduleDao.createSearchCriteria();
179-
sc.addAnd("asyncJobId", SearchCriteria.Op.NNULL);
180-
final List<SnapshotScheduleVO> snapshotSchedules = _snapshotScheduleDao.search(sc, null);
181-
for (final SnapshotScheduleVO snapshotSchedule : snapshotSchedules) {
182-
final Long asyncJobId = snapshotSchedule.getAsyncJobId();
183-
final AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId);
184-
switch (asyncJob.getStatus()) {
185-
case SUCCEEDED:
186-
// The snapshot has been successfully backed up.
187-
// The snapshot state has also been cleaned up.
188-
// We can schedule the next job for this snapshot.
189-
// Remove the existing entry in the snapshot_schedule table.
190-
scheduleNextSnapshotJob(snapshotSchedule);
191-
break;
192-
case FAILED:
193-
// Check the snapshot status.
194-
final Long snapshotId = snapshotSchedule.getSnapshotId();
195-
if (snapshotId == null) {
196-
// createSnapshotAsync exited, successfully or unsuccessfully,
197-
// even before creating a snapshot record
198-
// No cleanup needs to be done.
199-
// Schedule the next snapshot.
200-
scheduleNextSnapshotJob(snapshotSchedule);
201-
} else {
202-
final SnapshotVO snapshot = _snapshotDao.findById(snapshotId);
203-
if (snapshot == null || snapshot.getRemoved() != null) {
204-
// This snapshot has been deleted successfully from the primary storage
205-
// Again no cleanup needs to be done.
206-
// Schedule the next snapshot.
207-
// There's very little probability that the code reaches this point.
208-
// The snapshotId is a foreign key for the snapshot_schedule table
209-
// set to ON DELETE CASCADE. So if the snapshot entry is deleted, the snapshot_schedule entry will be too.
210-
// But what if it has only been marked as removed?
211-
scheduleNextSnapshotJob(snapshotSchedule);
212-
} else {
213-
// The management server executing this snapshot job appears to have crashed
214-
// while creating the snapshot on primary storage/or backing it up.
215-
// We have no idea whether the snapshot was successfully taken on the primary or not.
216-
// Schedule the next snapshot job.
217-
// The ValidatePreviousSnapshotCommand will take appropriate action on this snapshot
218-
// If the snapshot was taken successfully on primary, it will retry backing it up.
219-
// and cleanup the previous snapshot
220-
// Set the userId to that of system.
221-
//_snapshotManager.validateSnapshot(1L, snapshot);
222-
// In all cases, schedule the next snapshot job
223-
scheduleNextSnapshotJob(snapshotSchedule);
224-
}
225-
}
176+
private void scheduleNextSnapshotJobsIfNecessary() {
177+
List<SnapshotScheduleVO> snapshotSchedules = _snapshotScheduleDao.getSchedulesAssignedWithAsyncJob();
178+
logger.info("Verifying the current state of [{}] snapshot schedules and scheduling next jobs, if necessary.", snapshotSchedules.size());
179+
for (SnapshotScheduleVO snapshotSchedule : snapshotSchedules) {
180+
scheduleNextSnapshotJobIfNecessary(snapshotSchedule);
181+
}
182+
}
226183

227-
break;
228-
case IN_PROGRESS:
229-
// There is no way of knowing from here whether
230-
// 1) Another management server is processing this snapshot job
231-
// 2) The management server has crashed and this snapshot is lying
232-
// around in an inconsistent state.
233-
// Hopefully, this can be resolved at the backend when the current snapshot gets executed.
234-
// But if it remains in this state, the current snapshot will not get executed.
235-
// And it will remain in stasis.
236-
break;
237-
}
184+
protected void scheduleNextSnapshotJobIfNecessary(SnapshotScheduleVO snapshotSchedule) {
185+
Long asyncJobId = snapshotSchedule.getAsyncJobId();
186+
AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId);
187+
188+
if (asyncJob == null) {
189+
logger.debug("The async job [{}] of snapshot schedule [{}] does not exist anymore. Considering it as finished and scheduling the next snapshot job.",
190+
asyncJobId, snapshotSchedule);
191+
scheduleNextSnapshotJob(snapshotSchedule);
192+
return;
238193
}
194+
195+
JobInfo.Status status = asyncJob.getStatus();
196+
197+
if (JobInfo.Status.SUCCEEDED.equals(status)) {
198+
logger.debug("Last job of schedule [{}] succeeded; scheduling the next snapshot job.", snapshotSchedule);
199+
} else if (JobInfo.Status.FAILED.equals(status)) {
200+
logger.debug("Last job of schedule [{}] failed with [{}]; scheduling a new snapshot job.", snapshotSchedule, asyncJob.getResult());
201+
} else {
202+
logger.debug("Schedule [{}] is still in progress, skipping next job scheduling.", snapshotSchedule);
203+
return;
204+
}
205+
206+
scheduleNextSnapshotJob(snapshotSchedule);
239207
}
240208

241209
@DB

server/src/test/java/com/cloud/storage/snapshot/SnapshotSchedulerImplTest.java

+59
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
import com.cloud.user.Account;
2727
import com.cloud.user.AccountVO;
2828
import com.cloud.user.dao.AccountDao;
29+
import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao;
30+
import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO;
31+
import org.apache.cloudstack.jobs.JobInfo;
2932
import org.junit.Assert;
3033
import org.junit.Test;
3134
import org.junit.runner.RunWith;
@@ -65,6 +68,16 @@ public class SnapshotSchedulerImplTest {
6568
@Mock
6669
AccountVO accountVoMock;
6770

71+
@Mock
72+
private SnapshotScheduleVO snapshotScheduleVoMock;
73+
74+
@Mock
75+
private AsyncJobDao asyncJobDaoMock;
76+
77+
@Mock
78+
private AsyncJobVO asyncJobVoMock;
79+
80+
6881
@Test
6982
public void scheduleNextSnapshotJobTestParameterIsNullReturnNull() {
7083
SnapshotScheduleVO snapshotScheduleVO = null;
@@ -215,4 +228,50 @@ public void canSnapshotBeScheduledTestSnapshotPolicyIsNotRemovedDoNotCallRemove(
215228

216229
Mockito.verify(snapshotScheduleDaoMock, Mockito.never()).remove(Mockito.anyLong());
217230
}
231+
232+
@Test
233+
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobIsNullThenScheduleNextSnapshot() {
234+
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
235+
Mockito.doReturn(null).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
236+
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
237+
238+
snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);
239+
240+
Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
241+
}
242+
243+
@Test
244+
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobSucceededThenScheduleNextSnapshot() {
245+
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
246+
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
247+
Mockito.doReturn(JobInfo.Status.SUCCEEDED).when(asyncJobVoMock).getStatus();
248+
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
249+
250+
snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);
251+
252+
Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
253+
}
254+
255+
@Test
256+
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobFailedThenScheduleNextSnapshot() {
257+
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
258+
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
259+
Mockito.doReturn(JobInfo.Status.FAILED).when(asyncJobVoMock).getStatus();
260+
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
261+
262+
snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);
263+
264+
Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
265+
}
266+
267+
@Test
268+
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobInProgressThenDoNothing() {
269+
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
270+
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
271+
Mockito.doReturn(JobInfo.Status.IN_PROGRESS).when(asyncJobVoMock).getStatus();
272+
273+
snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);
274+
275+
Mockito.verify(snapshotSchedulerImplSpy, Mockito.never()).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
276+
}
218277
}

0 commit comments

Comments
 (0)