Revision 7213
Added by ben leinfelder over 12 years ago
src/edu/ucsb/nceas/metacat/dataone/hazelcast/HazelcastService.java | ||
---|---|---|
492 | 492 |
* Make sure we have a copy of every entry in the shared map. |
493 | 493 |
* We use lazy loading and therefore the CNs may not all be in sync when one |
494 | 494 |
* comes back online after an extended period of being offline |
495 |
* This method is meant to retrieve shared SystemMetadata that does not exist locally. |
|
496 |
* A major shortcoming here is that if Hazelcast has assigned this node ownership of a PID |
|
497 |
* that does not exist on the local backing store, the SystemMetadata will be null. |
|
495 | 498 |
* @throws Exception |
499 |
* @deprecated |
|
496 | 500 |
*/ |
497 | 501 |
private void resynch() throws Exception { |
498 | 502 |
// loop through all the [shared] entries and save any missing ones locally |
... | ... | |
522 | 526 |
} |
523 | 527 |
} |
524 | 528 |
|
529 |
/** |
|
530 |
* Make sure we have a copy of every entry in the shared map. |
|
531 |
* We use lazy loading and therefore the CNs may not all be in sync when one |
|
532 |
* comes back online after an extended period of being offline |
|
533 |
* This method loops through the entries that a FULLY UP-TO-DATE CN has |
|
534 |
* and makes sure each one is present on the shared map. |
|
535 |
* It is meant to overcome a HZ weakness wherein ownership of a key results in |
|
536 |
* null values where the owner does not have a complete backing store. |
|
537 |
* This will be an expensive routine and should be run in a background process so that |
|
538 |
* the server can continue to service other requests during the synch |
|
539 |
* @throws Exception |
|
540 |
*/ |
|
541 |
private void resynchToRemote() throws Exception { |
|
542 |
// loop through all the [shared] entries and push any missing values from the local backing store to the shared map |
|
543 |
List<String> localPids = IdentifierManager.getInstance().getAllSystemMetadataGUIDs(); |
|
544 |
logMetacat.warn("local pid count: " + localPids.size() + ", shared pid count: " + identifiers.size()); |
|
545 |
if (localPids.size() == identifiers.size()) { |
|
546 |
|
|
547 |
//loop through all the pids to find any nulls |
|
548 |
Iterator<Identifier> sharedPids = identifiers.iterator(); |
|
549 |
while (sharedPids.hasNext()) { |
|
550 |
Identifier pid = sharedPids.next(); |
|
551 |
logMetacat.trace("checking ownership for shared pid: " + pid.getValue()); |
|
552 |
Partition partition = hzInstance.getPartitionService().getPartition(pid); |
|
553 |
Member owner = partition.getOwner(); |
|
554 |
boolean isLocalPid = owner.localMember(); |
|
555 |
logMetacat.warn("owner of pid: " + pid.getValue() + " isLocal: " + isLocalPid); |
|
556 |
if (!isLocalPid) { |
|
557 |
logMetacat.trace("looking up shared value for pid: " + pid.getValue()); |
|
558 |
SystemMetadata sm = systemMetadata.get(pid); |
|
559 |
if (sm == null) { |
|
560 |
logMetacat.error("shared SystemMetadata for pid is null: " + pid.getValue()); |
|
561 |
// get directly from backing store |
|
562 |
sm = IdentifierManager.getInstance().getSystemMetadata(pid.getValue()); |
|
563 |
if (sm != null) { |
|
564 |
logMetacat.trace("saving local SystemMetadata to shared map for pid: " + pid.getValue()); |
|
565 |
systemMetadata.put(pid, sm); |
|
566 |
} else { |
|
567 |
logMetacat.error("local SystemMetadata is null for pid: " + pid.getValue()); |
|
568 |
} |
|
569 |
} |
|
570 |
} |
|
571 |
} |
|
572 |
} |
|
573 |
} |
|
574 |
|
|
525 | 575 |
private void resynchInThread() { |
526 | 576 |
logMetacat.debug("launching system metadata resynch in a thread"); |
527 | 577 |
ExecutorService executor = Executors.newSingleThreadExecutor(); |
... | ... | |
529 | 579 |
@Override |
530 | 580 |
public void run() { |
531 | 581 |
try { |
532 |
resynch(); |
|
582 |
// this is a pull mechanism |
|
583 |
//resynch(); |
|
584 |
// this is a push mechanism |
|
585 |
resynchToRemote(); |
|
533 | 586 |
} catch (Exception e) { |
534 | 587 |
logMetacat.error("Error in resynchInThread: " + e.getMessage(), e); |
535 | 588 |
} |
Also available in: Unified diff
Push SystemMetadata entries from the CN that has them all to the shared map, where other nodes may not have all entries. The CN with the complete copy only pushes SM entries that it does not own and that return as null, because those are the ones that are missing on the other, non-complete CNs.
This is different from the previous approach where a stale CN tried to PULL its missing entries from the shared map. Because of HZ key ownership, there is no guarantee that a stale node will have SystemMetadata DB records for all the pids it "owns".