Project

General

Profile

« Previous | Next » 

Revision 7213

Push SystemMetadata entries from the CN that has them all to the shared map, where other nodes may not have all entries. The CN with the complete copy only pushes SM entries that it does not own and that return as null, because those are the ones that are missing on the other, non-complete CNs.
This is different from the previous approach, where a stale CN tried to PULL its missing entries from the shared map. Because of HZ key ownership, there is no guarantee that a stale node will have SystemMetadata DB records for all the pids it "owns".

View differences:

src/edu/ucsb/nceas/metacat/dataone/hazelcast/HazelcastService.java
492 492
	 * Make sure we have a copy of every entry in the shared map.
493 493
	 * We use lazy loading and therefore the CNs may not all be in sync when one
494 494
	 * comes back online after an extended period of being offline
495
	 * This method is meant to retrieve shared SystemMetadata that does not exist locally.
496
	 * A major shortcoming here is that if Hazelcast has assigned this node ownership of a PID
497
	 * that does not exist on the local backing store, the SystemMetadata will be null.
495 498
	 * @throws Exception
499
	 * @deprecated
496 500
	 */
497 501
	private void resynch() throws Exception {
498 502
		// loop through all the [shared] entries and save any missing ones locally
......
522 526
		}
523 527
	}
524 528
	
529
	/**
530
	 * Make sure we have a copy of every entry in the shared map.
531
	 * We use lazy loading and therefore the CNs may not all be in sync when one
532
	 * comes back online after an extended period of being offline
533
	 * This method loops through the entries that a FULLY UP-TO-DATE CN has
534
	 * and makes sure each one is present on the shared map.
535
	 * It is meant to overcome a HZ weakness wherein ownership of a key results in 
536
	 * null values where the owner does not have a complete backing store.
537
	 * This will be an expensive routine and should be run in a background process so that
538
	 * the server can continue to service other requests during the synch
539
	 * @throws Exception
540
	 */
541
	private void resynchToRemote() throws Exception {
542
		// compare the local pid count with the shared identifier count; the push below only runs when they match
543
		List<String> localPids = IdentifierManager.getInstance().getAllSystemMetadataGUIDs();
544
		logMetacat.warn("local pid count: " + localPids.size() + ", shared pid count: " + identifiers.size());
545
		if (localPids.size() == identifiers.size()) {
546
			// NOTE(review): equal counts appear to be the test for "this node has the complete copy" - confirm
547
			// for each shared pid whose partition owner is another member, look for a null shared value
548
			Iterator<Identifier> sharedPids = identifiers.iterator();
549
			while (sharedPids.hasNext()) {
550
				Identifier pid = sharedPids.next();
551
				logMetacat.trace("checking ownership for shared pid: " + pid.getValue());
552
				Partition partition = hzInstance.getPartitionService().getPartition(pid);
553
				Member owner = partition.getOwner();
554
				boolean isLocalPid = owner.localMember();
555
				logMetacat.warn("owner of pid: " + pid.getValue() + " isLocal: " + isLocalPid);
556
				if (!isLocalPid) {
557
					logMetacat.trace("looking up shared value for pid: " + pid.getValue());
558
					SystemMetadata sm = systemMetadata.get(pid);
559
					if (sm == null)  {
560
						logMetacat.error("shared SystemMetadata for pid is null: " + pid.getValue());
561
						// shared value is missing: read it directly from the backing store and, if found, put it on the shared map
562
						sm = IdentifierManager.getInstance().getSystemMetadata(pid.getValue());
563
						if (sm != null)  {
564
							logMetacat.trace("saving local SystemMetadata to shared map for pid: " + pid.getValue());
565
							systemMetadata.put(pid, sm);
566
						} else {
567
							logMetacat.error("local SystemMetadata is null for pid: " + pid.getValue());
568
						}
569
					}
570
				}
571
			}
572
		}
573
	}
574
	
525 575
	private void resynchInThread() {
526 576
		logMetacat.debug("launching system metadata resynch in a thread");
527 577
		ExecutorService executor = Executors.newSingleThreadExecutor();
......
529 579
			@Override
530 580
			public void run() {
531 581
				try {
532
					resynch();
582
					// this is a pull mechanism
583
					//resynch();
584
					// this is a push mechanism
585
					resynchToRemote();
533 586
				} catch (Exception e) {
534 587
					logMetacat.error("Error in resynchInThread: " + e.getMessage(), e);
535 588
				}

Also available in: Unified diff