001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.util.Time.now; 021 022import java.io.DataInput; 023import java.io.DataInputStream; 024import java.io.DataOutputStream; 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.FileNotFoundException; 028import java.io.FileOutputStream; 029import java.io.IOException; 030import java.security.DigestInputStream; 031import java.security.DigestOutputStream; 032import java.security.MessageDigest; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Collection; 036import java.util.HashMap; 037import java.util.List; 038import java.util.Map; 039import java.util.TreeMap; 040 041import org.apache.commons.logging.Log; 042import org.apache.hadoop.classification.InterfaceAudience; 043import org.apache.hadoop.classification.InterfaceStability; 044import org.apache.hadoop.conf.Configuration; 045import org.apache.hadoop.fs.FileSystem; 046import org.apache.hadoop.fs.Path; 047import org.apache.hadoop.fs.PathIsNotDirectoryException; 048import org.apache.hadoop.fs.UnresolvedLinkException; 049import 
org.apache.hadoop.fs.permission.PermissionStatus; 050import org.apache.hadoop.hdfs.DFSUtil; 051import org.apache.hadoop.hdfs.protocol.HdfsConstants; 052import org.apache.hadoop.hdfs.protocol.LayoutFlags; 053import org.apache.hadoop.hdfs.protocol.LayoutVersion; 054import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; 055import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; 056import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; 057import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 058import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; 059import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; 060import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature; 061import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList; 062import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat; 064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap; 065import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 068import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 069import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 070import org.apache.hadoop.hdfs.util.ReadOnlyList; 071import org.apache.hadoop.io.IOUtils; 072import org.apache.hadoop.io.MD5Hash; 073import org.apache.hadoop.io.Text; 074import org.apache.hadoop.util.StringUtils; 075 076import com.google.common.annotations.VisibleForTesting; 077import com.google.common.base.Preconditions; 078 079/** 080 * Contains inner classes for reading or writing the on-disk format for 081 * FSImages. 
082 * 083 * In particular, the format of the FSImage looks like: 084 * <pre> 085 * FSImage { 086 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long, 087 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long, 088 * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId: 089 * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int, 090 * numOfSnapshottableDirs: int, 091 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed) 092 * } 093 * 094 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) { 095 * INodeInfo of root, numberOfChildren of root: int 096 * [list of INodeInfo of root's children], 097 * [list of INodeDirectoryInfo of root's directory children] 098 * } 099 * 100 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){ 101 * [list of INodeInfo of INodes in topological order] 102 * } 103 * 104 * INodeInfo { 105 * { 106 * localName: short + byte[] 107 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported 108 * or 109 * { 110 * fullPath: byte[] 111 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported 112 * replicationFactor: short, modificationTime: long, 113 * accessTime: long, preferredBlockSize: long, 114 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink), 115 * { 116 * nsQuota: long, dsQuota: long, 117 * { 118 * isINodeSnapshottable: byte, 119 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false) 120 * } (when {@link Feature#SNAPSHOT} is supported), 121 * fsPermission: short, PermissionStatus 122 * } for INodeDirectory 123 * or 124 * { 125 * symlinkString, fsPermission: short, PermissionStatus 126 * } for INodeSymlink 127 * or 128 * { 129 * [list of BlockInfo] 130 * [list of FileDiff] 131 * { 132 * isINodeFileUnderConstructionSnapshot: byte, 133 * {clientName: short + byte[], clientMachine: short + byte[]} (when 134 * isINodeFileUnderConstructionSnapshot is true), 135 * 
} (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
 *   fsPermission: short, PermissionStatus
 * } for INodeFile
 * }
 *
 * INodeDirectoryInfo {
 *   fullPath of the directory: short + byte[],
 *   numberOfChildren: int, [list of INodeInfo of children INode],
 *   {
 *     numberOfSnapshots: int,
 *     [list of Snapshot] (when NumberOfSnapshots is positive),
 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
 *     number of children that are directories,
 *     [list of INodeDirectoryInfo of the directory children] (includes
 *     snapshot copies of deleted sub-directories)
 *   } (when {@link Feature#SNAPSHOT} is supported),
 * }
 *
 * Snapshot {
 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
 *   the name of the snapshot)
 * }
 *
 * DirectoryDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   childrenSize: int,
 *   isSnapshotRoot: byte,
 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
 * }
 *
 * Diff {
 *   createdListSize: int, [Local name of INode in created list],
 *   deletedListSize: int, [INode in deleted list: INodeInfo]
 * }
 *
 * FileDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   fileSize: long,
 *   snapshotINodeIsNotNull: byte,
 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
 * }
 * </pre>
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImageFormat {
  private static final Log LOG = FSImage.LOG;

  // Static-only class: all functionality lives in the nested loader types.
  private FSImageFormat() {}

  /** Common read-side view shared by the legacy and protobuf image loaders. */
  interface AbstractLoader {
    MD5Hash getLoadedImageMd5();
    long getLoadedImageTxId();
  }

  /**
   * Loader facade that sniffs the image file's magic header and delegates to
   * either the protobuf-based loader or the legacy {@link Loader}.
   */
  static class LoaderDelegator implements AbstractLoader {
    // Concrete loader chosen inside load(); null until load() has been called.
    private AbstractLoader impl;
    private final Configuration conf;
    private final FSNamesystem fsn;

    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
      this.conf = conf;
      this.fsn = fsn;
    }

    @Override
    public MD5Hash getLoadedImageMd5() {
      return impl.getLoadedImageMd5();
    }

    @Override
    public long getLoadedImageTxId() {
      return impl.getLoadedImageTxId();
    }

    /**
     * Load the given image file, selecting the on-disk format by its header.
     * May be called at most once per delegator instance.
     *
     * @param file the fsimage file to load
     * @param requireSameLayoutVersion forwarded to the protobuf loader only
     * @throws IOException if the image cannot be read
     */
    public void load(File file, boolean requireSameLayoutVersion)
        throws IOException {
      Preconditions.checkState(impl == null, "Image already loaded!");

      FileInputStream is = null;
      try {
        is = new FileInputStream(file);
        // A protobuf-based image begins with MAGIC_HEADER; a legacy image
        // begins with the (negative) layout version int instead.
        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
        IOUtils.readFully(is, magic, 0, magic.length);
        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
              conf, fsn, requireSameLayoutVersion);
          impl = loader;
          loader.load(file);
        } else {
          Loader loader = new Loader(conf, fsn);
          impl = loader;
          loader.load(file);
        }
      } finally {
        // Each delegate re-opens the file itself; this closes only the stream
        // used for header sniffing.
        IOUtils.cleanup(LOG, is);
      }
    }
  }

  /**
   * Construct a loader class to load the image. It chooses the loader based on
   * the layout version.
   */
  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
    return new LoaderDelegator(conf, fsn);
  }

  /**
   * A one-shot class responsible for loading an image. The load() function
   * should be called once, after which the getter methods may be used to retrieve
   * information about the image that was loaded, if loading was successful.
   */
  public static class Loader implements AbstractLoader {
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    // Maps on-disk snapshot IDs to Snapshot objects; populated during load()
    // only when the image's layout version supports snapshots.
    private Map<Integer, Snapshot> snapshotMap = null;
    private final ReferenceMap referenceMap = new ReferenceMap();

    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }

    /**
     * Return the MD5 checksum of the image that has been loaded.
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public MD5Hash getLoadedImageMd5() {
      checkLoaded();
      return imgDigest;
    }

    @Override
    public long getLoadedImageTxId() {
      checkLoaded();
      return imgTxId;
    }

    /**
     * Throw IllegalStateException if load() has not yet been called.
     */
    private void checkLoaded() {
      if (!loaded) {
        throw new IllegalStateException("Image not yet loaded!");
      }
    }

    /**
     * Throw IllegalStateException if load() has already been called.
     */
    private void checkNotLoaded() {
      if (loaded) {
        throw new IllegalStateException("Image already loaded!");
      }
    }

    /**
     * Load a legacy-format fsimage file into the namesystem. The sections
     * below are read in exactly the order the saver wrote them; the set of
     * sections present depends on the image's layout version features.
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Everything read through fin is folded into the MD5 digest so the
      // image checksum can be verified after loading.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
          new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);

        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }

        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // Rebind 'in' to the (possibly) decompressing stream: every section
        // after the header is read through the compression codec.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);

        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;

      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }

    /** Update the root node's attributes */
    private void updateRootAttr(INodeWithAdditionalFields root) {
      final Quota.Counts q = root.getQuotaCounts();
      final long nsQuota = q.get(Quota.NAMESPACE);
      final long dsQuota = q.get(Quota.DISKSPACE);
      FSDirectory fsDir = namesystem.dir;
      // -1 means "no quota set" for both namespace and diskspace quotas.
      if (nsQuota != -1 || dsQuota != -1) {
        fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
      }
      fsDir.rootDir.cloneModificationTime(root);
      fsDir.rootDir.clonePermissionStatus(root);
    }

    /**
     * Load fsimage files when 1) only local names are stored,
     * and 2) snapshot is supported.
     *
     * @param numFiles number of files expected to be read
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());

      // load root
      loadRoot(in, counter);
      // load rest of the nodes recursively
      loadDirectoryWithSnapshot(in, counter);
    }

    /**
     * load fsimage files assuming only local names are stored. Used when
     * snapshots are not supported by the layout version.
     *
     * @param numFiles number of files expected to be read
     * @param in image input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException
     */
    private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert numFiles > 0;

      // load root
      loadRoot(in, counter);
      // have loaded the first file (the root)
      numFiles--;

      // load rest of the nodes directory by directory; each call consumes one
      // directory's children and returns how many inodes it read.
      while (numFiles > 0) {
        numFiles -= loadDirectory(in, counter);
      }
      if (numFiles != 0) {
        throw new IOException("Read unexpect number of files: " + -numFiles);
      }
    }

    /**
     * Load information about root, and use the information to update the root
     * directory of NameSystem.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadRoot(DataInput in, Counter counter)
        throws IOException {
      // load root: the root inode is serialized with an empty (zero-length)
      // name, so a non-zero name length means the stream is not positioned
      // at the root record.
      if (in.readShort() != 0) {
        throw new IOException("First node is not root");
      }
      final INodeDirectory root = loadINode(null, false, in, counter)
          .asDirectory();
      // update the root's attributes
      updateRootAttr(root);
    }

    /** Load children nodes for the parent directory. */
    private int loadChildren(INodeDirectory parent, DataInput in,
        Counter counter) throws IOException {
      int numChildren = in.readInt();
      for (int i = 0; i < numChildren; i++) {
        // load single inode
        INode newNode = loadINodeWithLocalName(false, in, true, counter);
        addToParent(parent, newNode);
      }
      return numChildren;
    }

    /**
     * Load a directory when snapshot is supported.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable; a negative count
      // means the directory is not snapshottable.
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(parent, numSnapshots, in, this);
        if (parent.getDirectorySnapshottableFeature().getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(parent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }

    /**
     * Load all children of a directory
     *
     * @param in input to load from
     * @param counter Counter to increment for namenode startup progress
     * @return number of child inodes read
     * @throws IOException
     */
    private int loadDirectory(DataInput in, Counter counter) throws IOException {
      String parentPath = FSImageSerialization.readString(in);
      // Rename .snapshot paths if we're doing an upgrade
      parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
      final INodeDirectory parent = INodeDirectory.valueOf(
          namesystem.dir.getNode(parentPath, true), parentPath);
      return loadChildren(parent, in, counter);
    }

    /**
     * load fsimage files assuming full path names are stored
     *
     * @param numFiles total number of files to load
     * @param in data input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if any error occurs
     */
    private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      byte[][] pathComponents;
      byte[][] parentPath = {{}};
      FSDirectory fsDir = namesystem.dir;
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        pathComponents = FSImageSerialization.readPathComponents(in);
        // Rewrite any path components that collide with reserved names
        // introduced by this software version (e.g. ".snapshot").
        for (int j=0; j < pathComponents.length; j++) {
          byte[] newComponent = renameReservedComponentOnUpgrade
              (pathComponents[j], getLayoutVersion());
          if (!Arrays.equals(newComponent, pathComponents[j])) {
            String oldPath = DFSUtil.byteArray2PathString(pathComponents);
            pathComponents[j] = newComponent;
            String newPath = DFSUtil.byteArray2PathString(pathComponents);
            LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
          }
        }
        final INode newNode = loadINode(
            pathComponents[pathComponents.length-1], false, in, counter);

        if (isRoot(pathComponents)) { // it is the root
          // update the root's attributes
          updateRootAttr(newNode.asDirectory());
          continue;
        }

        namesystem.dir.addToInodeMap(newNode);
        // check if the new inode belongs to the same parent; inodes are
        // written in topological order, so the cached parent usually matches.
        if(!isParent(pathComponents, parentPath)) {
          parentINode = getParentINodeDirectory(pathComponents);
          parentPath = getParent(pathComponents);
        }

        // add new inode
        addToParent(parentINode, newNode);
      }
    }

    private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
        ) throws FileNotFoundException, PathIsNotDirectoryException,
        UnresolvedLinkException {
      if (pathComponents.length < 2) { // root
        return null;
      }
      // Gets the parent INode: index -2 is the second-to-last inode in the
      // resolved path, i.e. the parent of the last component.
      final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
          pathComponents);
      return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
    }

    /**
     * Add the child node to parent and, if child is a file, update block map.
     * This method is only used for image loading so that synchronization,
     * modification time update and space count update are not needed.
     */
    private void addToParent(INodeDirectory parent, INode child) {
      FSDirectory fsDir = namesystem.dir;
      // Children of the root may carry reserved names that must be renamed
      // when upgrading from an older layout.
      if (parent == fsDir.rootDir) {
        child.setLocalName(renameReservedRootComponentOnUpgrade(
            child.getLocalNameBytes(), getLayoutVersion()));
      }
      // NOTE: This does not update space counts for parents
      if (!parent.addChild(child)) {
        return;
      }
      namesystem.dir.cacheName(child);

      if (child.isFile()) {
        updateBlocksMap(child.asFile());
      }
    }

    public void updateBlocksMap(INodeFile file) {
      // Add file->block mapping
      final BlockInfo[] blocks = file.getBlocks();
      if (blocks != null) {
        final BlockManager bm = namesystem.getBlockManager();
        for (int i = 0; i < blocks.length; i++) {
          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
        }
      }
    }

    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }

    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }

    public INode loadINodeWithLocalName(boolean isSnapshotINode,
        DataInput in, boolean updateINodeMap, Counter counter)
        throws IOException {
      byte[] localName = FSImageSerialization.readLocalName(in);
      localName =
          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
      INode inode = loadINode(localName, isSnapshotINode, in, counter);
      if (updateINodeMap) {
        namesystem.dir.addToInodeMap(inode);
      }
      return inode;
    }

    /**
     * load an inode from fsimage except for its name
     *
     * The inode kind is encoded in numberOfBlocks: >= 0 file, -1 directory,
     * -2 symlink, -3 reference (see the class javadoc for the full layout).
     *
     * @param in data input stream from which image is read
     * @param counter Counter to increment for namenode startup progress
     * @return an inode
     */
    @SuppressWarnings("deprecation")
    INode loadINode(final byte[] localName, boolean isSnapshotINode,
        DataInput in, Counter counter) throws IOException {
      final int imgVersion = getLayoutVersion();
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
        namesystem.getFSDirectory().verifyINodeName(localName);
      }

      long inodeId = NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
          : namesystem.allocateNewInodeId();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long modificationTime = in.readLong();
      long atime = 0;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
        atime = in.readLong();
      }
      final long blockSize = in.readLong();
      final int numBlocks = in.readInt();

      if (numBlocks >= 0) {
        // file

        // read blocks
        BlockInfo[] blocks = new BlockInfo[numBlocks];
        for (int j = 0; j < numBlocks; j++) {
          blocks[j] = new BlockInfo(replication);
          blocks[j].readFields(in);
        }

        String clientName = "";
        String clientMachine = "";
        boolean underConstruction = false;
        FileDiffList fileDiffs = null;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          // read diffs
          fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

          if (isSnapshotINode) {
            underConstruction = in.readBoolean();
            if (underConstruction) {
              clientName = FSImageSerialization.readString(in);
              clientMachine = FSImageSerialization.readString(in);
              // convert the last block to BlockUC
              if (blocks.length > 0) {
                BlockInfo lastBlk = blocks[blocks.length - 1];
                blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
                    lastBlk, replication);
              }
            }
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        // return
        if (counter != null) {
          counter.increment();
        }

        final INodeFile file = new INodeFile(inodeId, localName, permissions,
            modificationTime, atime, blocks, replication, blockSize, (byte)0);
        if (underConstruction) {
          file.toUnderConstruction(clientName, clientMachine);
        }
        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
      } else if (numBlocks == -1) {
        //directory

        //read quotas
        final long nsQuota = in.readLong();
        long dsQuota = -1L;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
          dsQuota = in.readLong();
        }

        //read snapshot info
        boolean snapshottable = false;
        boolean withSnapshot = false;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          snapshottable = in.readBoolean();
          if (!snapshottable) {
            withSnapshot = in.readBoolean();
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        //return
        if (counter != null) {
          counter.increment();
        }
        final INodeDirectory dir = new INodeDirectory(inodeId, localName,
            permissions, modificationTime);
        if (nsQuota >= 0 || dsQuota >= 0) {
          dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
        }
        if (withSnapshot) {
          dir.addSnapshotFeature(null);
        }
        if (snapshottable) {
          dir.addSnapshottableFeature();
        }
        return dir;
      } else if (numBlocks == -2) {
        //symlink
        if (!FileSystem.areSymlinksEnabled()) {
          throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
        }

        final String symlink = Text.readString(in);
        final PermissionStatus permissions = PermissionStatus.read(in);
        if (counter != null) {
          counter.increment();
        }
        return new INodeSymlink(inodeId, localName, permissions,
            modificationTime, atime, symlink);
      } else if (numBlocks == -3) {
        //reference
        // Intentionally do not increment counter, because it is too difficult at
        // this point to assess whether or not this is a reference that counts
        // toward quota.

        final boolean isWithName = in.readBoolean();
        // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
        int snapshotId = in.readInt();

        final INodeReference.WithCount withCount
            = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

        if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
        } else {
          final INodeReference ref = new INodeReference.DstReference(null,
              withCount, snapshotId);
          return ref;
        }
      }

      throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
    }

    /** Load {@link INodeFileAttributes}. */
    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      // Older layouts store a full inode for snapshot copies; newer
      // (OPTIMIZE_SNAPSHOT_INODES) layouts store only the attributes.
      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        return loadINodeWithLocalName(true, in, false).asFile();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();
      final long accessTime = in.readLong();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long preferredBlockSize = in.readLong();

      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
          accessTime, replication, preferredBlockSize, (byte) 0, null);
    }

    /** Load {@link INodeDirectoryAttributes} for a snapshot copy. */
    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        return loadINodeWithLocalName(true, in, false).asDirectory();
      }

      final
byte[] name = FSImageSerialization.readLocalName(in); 907 final PermissionStatus permissions = PermissionStatus.read(in); 908 final long modificationTime = in.readLong(); 909 910 //read quotas 911 final long nsQuota = in.readLong(); 912 final long dsQuota = in.readLong(); 913 914 return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy( 915 name, permissions, null, modificationTime, null) 916 : new INodeDirectoryAttributes.CopyWithQuota(name, permissions, 917 null, modificationTime, nsQuota, dsQuota, null); 918 } 919 920 private void loadFilesUnderConstruction(DataInput in, 921 boolean supportSnapshot, Counter counter) throws IOException { 922 FSDirectory fsDir = namesystem.dir; 923 int size = in.readInt(); 924 925 LOG.info("Number of files under construction = " + size); 926 927 for (int i = 0; i < size; i++) { 928 INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in, 929 namesystem, getLayoutVersion()); 930 counter.increment(); 931 932 // verify that file exists in namespace 933 String path = cons.getLocalName(); 934 INodeFile oldnode = null; 935 boolean inSnapshot = false; 936 if (path != null && FSDirectory.isReservedName(path) && 937 NameNodeLayoutVersion.supports( 938 LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) { 939 // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in 940 // snapshot. If we support INode ID in the layout version, we can use 941 // the inode id to find the oldnode. 
942 oldnode = namesystem.dir.getInode(cons.getId()).asFile(); 943 inSnapshot = true; 944 } else { 945 path = renameReservedPathsOnUpgrade(path, getLayoutVersion()); 946 final INodesInPath iip = fsDir.getLastINodeInPath(path); 947 oldnode = INodeFile.valueOf(iip.getINode(0), path); 948 } 949 950 FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature(); 951 oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine()); 952 if (oldnode.numBlocks() > 0) { 953 BlockInfo ucBlock = cons.getLastBlock(); 954 // we do not replace the inode, just replace the last block of oldnode 955 BlockInfo info = namesystem.getBlockManager().addBlockCollection( 956 ucBlock, oldnode); 957 oldnode.setBlock(oldnode.numBlocks() - 1, info); 958 } 959 960 if (!inSnapshot) { 961 namesystem.leaseManager.addLease(cons 962 .getFileUnderConstructionFeature().getClientName(), path); 963 } 964 } 965 } 966 967 private void loadSecretManagerState(DataInput in) 968 throws IOException { 969 int imgVersion = getLayoutVersion(); 970 971 if (!NameNodeLayoutVersion.supports( 972 LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) { 973 //SecretManagerState is not available. 974 //This must not happen if security is turned on. 
        return;
      }
      namesystem.loadSecretManagerStateCompat(in);
    }

    /**
     * Load the cache manager (pools/directives) state, when the image
     * layout version includes it.
     */
    private void loadCacheManagerState(DataInput in) throws IOException {
      int imgVersion = getLayoutVersion();
      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.CACHING, imgVersion)) {
        return;
      }
      namesystem.getCacheManager().loadStateCompat(in);
    }

    /** @return the layout version of the image currently being loaded */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }

    /** @return true if the path is the single null root component */
    private boolean isRoot(byte[][] path) {
      return path.length == 1 &&
        path[0] == null;
    }

    /**
     * @return true if {@code parent} is the immediate, non-empty parent of
     *         {@code path}, comparing every component byte-for-byte
     */
    private boolean isParent(byte[][] path, byte[][] parent) {
      if (path == null || parent == null)
        return false;
      if (parent.length == 0 || path.length != parent.length + 1)
        return false;
      boolean isParent = true;
      for (int i = 0; i < parent.length; i++) {
        isParent = isParent && Arrays.equals(path[i], parent[i]);
      }
      return isParent;
    }

    /**
     * Return string representing the parent of the given path.
     * NOTE(review): assumes the path contains a separator; a separator-free
     * input would make substring() throw StringIndexOutOfBoundsException —
     * confirm callers always pass absolute paths.
     */
    String getParent(String path) {
      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
    }

    /**
     * Return a deep copy of the path with its last component dropped.
     * Assumes {@code path.length >= 1}.
     */
    byte[][] getParent(byte[][] path) {
      byte[][] result = new byte[path.length - 1][];
      for (int i = 0; i < result.length; i++) {
        result[i] = new byte[path[i].length];
        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
      }
      return result;
    }

    /** Read a snapshot id from the stream and resolve it in the snapshot map. */
    public Snapshot getSnapshot(DataInput in) throws IOException {
      return snapshotMap.get(in.readInt());
    }
  }

  // Maps reserved path component -> replacement name applied when upgrading
  // from a layout in which the component was not yet reserved.
  @VisibleForTesting
  public static final TreeMap<String, String> renameReservedMap =
      new TreeMap<String, String>();

  /**
   * Use the default key-value pairs that will be used to determine how to
   * rename reserved paths on upgrade.
   */
  @VisibleForTesting
  public static void useDefaultRenameReservedPairs() {
    renameReservedMap.clear();
    for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
      // Default replacement: "<key>.<layout version>.UPGRADE_RENAMED"
      renameReservedMap.put(
          key,
          key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
              + "UPGRADE_RENAMED");
    }
  }

  /**
   * Set the key-value pairs that will be used to determine how to rename
   * reserved paths on upgrade.
   *
   * @param renameReserved comma-separated list of "/key=value" pairs; any
   *        key not mentioned keeps its default mapping
   */
  @VisibleForTesting
  public static void setRenameReservedPairs(String renameReserved) {
    // Clear and set the default values
    useDefaultRenameReservedPairs();
    // Overwrite with provided values
    setRenameReservedMapInternal(renameReserved);
  }

  /**
   * Parse "/key=value" pairs and install them in {@link #renameReservedMap},
   * validating each key is a known reserved component and each value a legal
   * path component name. Throws IllegalArgumentException (via Preconditions)
   * on malformed input.
   */
  private static void setRenameReservedMapInternal(String renameReserved) {
    Collection<String> pairs =
        StringUtils.getTrimmedStringCollection(renameReserved);
    for (String p : pairs) {
      String[] pair = StringUtils.split(p, '/', '=');
      Preconditions.checkArgument(pair.length == 2,
          "Could not parse key-value pair " + p);
      String key = pair[0];
      String value = pair[1];
      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
          "Unknown reserved path " + key);
      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
          "Invalid rename path for " + key + ": " + value);
      LOG.info("Will rename reserved path " + key + " to " + value);
      renameReservedMap.put(key, value);
    }
  }

  /**
   * When upgrading from an old version, the filesystem could contain paths
   * that are now reserved in the new version (e.g. .snapshot). This renames
   * these new reserved paths to a user-specified value to avoid collisions
   * with the reserved name.
   *
   * @param path Old path potentially containing a reserved path
   * @return New path with reserved path components renamed to user value
   */
  static String renameReservedPathsOnUpgrade(String path,
      final int layoutVersion) {
    final String oldPath = path;
    // If any known LVs aren't supported, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Only need to worry about the root directory
      if (components.length > 1) {
        // components[1] is the first component under "/"; only a root-level
        // ".reserved" needs renaming for the inode-id feature.
        components[1] = DFSUtil.bytes2String(
            renameReservedRootComponentOnUpgrade(
                DFSUtil.string2Bytes(components[1]),
                layoutVersion));
        path = DFSUtil.strings2PathString(components);
      }
    }
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Special case the root path
      if (components.length == 0) {
        return path;
      }
      // ".snapshot" may appear at any depth, so every component is checked.
      for (int i=0; i<components.length; i++) {
        components[i] = DFSUtil.bytes2String(
            renameReservedComponentOnUpgrade(
                DFSUtil.string2Bytes(components[i]),
                layoutVersion));
      }
      path = DFSUtil.strings2PathString(components);
    }

    if (!path.equals(oldPath)) {
      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
          + path);
    }
    return path;
  }

  // Error raised (as IllegalArgumentException via Preconditions) when a
  // reserved path is found during upgrade but no rename mapping exists.
  private final static String RESERVED_ERROR_MSG =
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for a
   * single byte array path component: renames a ".snapshot" component when
   * upgrading from a pre-snapshot layout.
   */
  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support snapshots, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
        Preconditions.checkArgument(
            renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
            RESERVED_ERROR_MSG);
        component =
            DFSUtil.string2Bytes(renameReservedMap
                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
      }
    }
    return component;
  }

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for the
   * single root-level ".reserved" byte array path component.
   */
  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support inode IDs, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
        Preconditions.checkArgument(
            renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
            RESERVED_ERROR_MSG);
        final String renameString = renameReservedMap
            .get(FSDirectory.DOT_RESERVED_STRING);
        component =
            DFSUtil.string2Bytes(renameString);
        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
            + " to " + renameString);
      }
    }
    return component;
  }

  /**
   * A one-shot class responsible for writing an image file.
   * The write() function should be called once, after which the getter
   * functions may be used to retrieve information about the file that was written.
   *
   * This is replaced by the PB-based FSImage. The class is to maintain
   * compatibility for the external fsimage tool.
   */
  @Deprecated
  static class Saver {
    /** Fixed layout version emitted by this legacy (pre-protobuf) writer. */
    private static final int LAYOUT_VERSION = -51;
    private final SaveNamespaceContext context;
    /** Set to true once an image has been written */
    private boolean saved = false;

    /** The MD5 checksum of the file that was written */
    private MD5Hash savedDigest;
    /** Tracks reference inodes so a shared subtree is serialized only once. */
    private final ReferenceMap referenceMap = new ReferenceMap();

    // Under-construction files that exist only in snapshots, keyed by inode
    // id; collected during saveChildren and written in the
    // under-construction section (see HDFS-5428 comments in save()).
    private final Map<Long, INodeFile> snapshotUCMap =
        new HashMap<Long, INodeFile>();

    /** @throws IllegalStateException if the instance has not yet saved an image */
    private void checkSaved() {
      if (!saved) {
        throw new IllegalStateException("FSImageSaver has not saved an image");
      }
    }

    /** @throws IllegalStateException if the instance has already saved an image */
    private void checkNotSaved() {
      if (saved) {
        throw new IllegalStateException("FSImageSaver has already saved an image");
      }
    }


    Saver(SaveNamespaceContext context) {
      this.context = context;
    }

    /**
     * Return the MD5 checksum of the image file that was saved.
     */
    MD5Hash getSavedDigest() {
      checkSaved();
      return savedDigest;
    }

    /**
     * Write the namespace image to {@code newFile}. Must be called at most
     * once per Saver instance (enforced by checkNotSaved()).
     *
     * @param newFile destination image file
     * @param compression codec used for the body of the image
     * @throws IOException on a write error or if the save is cancelled
     */
    void save(File newFile, FSImageCompression compression) throws IOException {
      checkNotSaved();

      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
      // Total inode count, taken from the root's namespace quota usage.
      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
          .getSpaceConsumed().get(Quota.NAMESPACE);
      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
      Step step = new Step(StepType.INODES, sdPath);
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
      long startTime = now();
      //
      // Write out data
      //
      // The digest stream sits directly above the file, so the MD5 covers
      // exactly the bytes written to disk (header plus compressed body).
      MessageDigest digester = MD5Hash.getDigester();
      FileOutputStream fout = new FileOutputStream(newFile);
      DigestOutputStream fos = new DigestOutputStream(fout, digester);
      DataOutputStream out = new DataOutputStream(fos);
      try {
        out.writeInt(LAYOUT_VERSION);
        LayoutFlags.write(out);
        // We use the non-locked version of getNamespaceInfo here since
        // the coordinating thread of saveNamespace already has read-locked
        // the namespace for us. If we attempt to take another readlock
        // from the actual saver thread, there's a potential of a
        // fairness-related deadlock. See the comments on HDFS-2223.
        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
            .getNamespaceID());
        out.writeLong(numINodes);
        out.writeLong(sourceNamesystem.getGenerationStampV1());
        out.writeLong(sourceNamesystem.getGenerationStampV2());
        out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
        out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
        out.writeLong(context.getTxId());
        out.writeLong(sourceNamesystem.getLastInodeId());


        sourceNamesystem.getSnapshotManager().write(out);

        // write compression info and set up compressed stream
        // NOTE: 'out' is re-pointed at the compression wrapper from here on;
        // closing it in the finally block closes the whole stream chain.
        out = compression.writeHeaderAndWrapStream(fos);
        LOG.info("Saving image file " + newFile +
            " using " + compression);

        // save the root
        saveINode2Image(rootDir, out, false, referenceMap, counter);
        // save the rest of the nodes
        saveImage(rootDir, out, true, false, counter);
        prog.endStep(Phase.SAVING_CHECKPOINT, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
        // save files under construction
        // TODO: for HDFS-5428, since we cannot break the compatibility of
        // fsimage, we store part of the under-construction files that are only
        // in snapshots in this "under-construction-file" section. As a
        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
        // paths, so that when loading fsimage we do not put them into the lease
        // map. In the future, we can remove this hack when we can bump the
        // layout version.
        sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);

        context.checkCancelled();
        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
        context.checkCancelled();
        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
        context.checkCancelled();
        out.flush();
        context.checkCancelled();
        // Force file contents to stable storage before declaring success.
        fout.getChannel().force(true);
      } finally {
        out.close();
      }

      saved = true;
      // set md5 of the saved image
      savedDigest = new MD5Hash(digester.digest());

      LOG.info("Image file " + newFile + " of size " + newFile.length() +
          " bytes saved in " + (now() - startTime)/1000 + " seconds.");
    }

    /**
     * Save children INodes.
     * @param children The list of children INodes
     * @param out The DataOutputStream to write
     * @param inSnapshot Whether the parent directory or its ancestor is in
     *                   the deleted list of some snapshot (caused by rename or
     *                   deletion)
     * @param counter Counter to increment for namenode startup progress
     * @return Number of children that are directory
     */
    private int saveChildren(ReadOnlyList<INode> children,
        DataOutputStream out, boolean inSnapshot, Counter counter)
        throws IOException {
      // Write normal children INode.
      out.writeInt(children.size());
      int dirNum = 0;
      int i = 0;
      for(INode child : children) {
        // print all children first
        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
        // here, thus even if the parent directory is in snapshot, we still
        // do not handle INodeUC as those stored in deleted list
        saveINode2Image(child, out, false, referenceMap, counter);
        if (child.isDirectory()) {
          dirNum++;
        } else if (inSnapshot && child.isFile()
            && child.asFile().isUnderConstruction()) {
          // Remember snapshot-only under-construction files; they are written
          // later in the dedicated under-construction section of the image.
          this.snapshotUCMap.put(child.getId(), child.asFile());
        }
        // Poll for save-namespace cancellation every 50 children.
        if (i++ % 50 == 0) {
          context.checkCancelled();
        }
      }
      return dirNum;
    }

    /**
     * Save file tree image starting from the given root.
     * This is a recursive procedure, which first saves all children and
     * snapshot diffs of a current directory and then moves inside the
     * sub-directories.
     *
     * @param current The current node
     * @param out The DataOutputStream to write the image
     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
     *                      reference node, its subtree may already have been
     *                      saved before.
     * @param inSnapshot Whether the current directory is in snapshot
     * @param counter Counter to increment for namenode startup progress
     */
    private void saveImage(INodeDirectory current, DataOutputStream out,
        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
        throws IOException {
      // write the inode id of the directory
      out.writeLong(current.getId());

      if (!toSaveSubtree) {
        return;
      }

      final ReadOnlyList<INode> children = current
          .getChildrenList(Snapshot.CURRENT_STATE_ID);
      int dirNum = 0;
      List<INodeDirectory> snapshotDirs = null;
      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
      if (sf != null) {
        // Directories that only exist in snapshots (deleted/renamed in the
        // current tree) are collected and saved after the live children.
        snapshotDirs = new ArrayList<INodeDirectory>();
        sf.getSnapshotDirectory(snapshotDirs);
        dirNum += snapshotDirs.size();
      }

      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
      // Snapshots
      if (current.isDirectory() && current.asDirectory().isSnapshottable()) {
        SnapshotFSImageFormat.saveSnapshots(current.asDirectory(), out);
      } else {
        out.writeInt(-1); // # of snapshots
      }

      // 3. Write children INode
      dirNum += saveChildren(children, out, inSnapshot, counter);

      // 4. Write DirectoryDiff lists, if there is any.
      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);

      // Write sub-tree of sub-directories, including possible snapshots of
      // deleted sub-directories
      out.writeInt(dirNum); // the number of sub-directories
      for(INode child : children) {
        if(!child.isDirectory()) {
          continue;
        }
        // make sure we only save the subtree under a reference node once
        boolean toSave = child.isReference() ?
            referenceMap.toProcessSubtree(child.getId()) : true;
        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
      }
      if (snapshotDirs != null) {
        for (INodeDirectory subDir : snapshotDirs) {
          // make sure we only save the subtree under a reference node once
          boolean toSave = subDir.getParentReference() != null ?
              referenceMap.toProcessSubtree(subDir.getId()) : true;
          // Snapshot-only subtrees are always saved with inSnapshot == true.
          saveImage(subDir, out, toSave, true, counter);
        }
      }
    }

    /**
     * Saves inode and increments progress counter.
     *
     * @param inode INode to save
     * @param out DataOutputStream to receive inode
     * @param writeUnderConstruction boolean true if this is under construction
     * @param referenceMap ReferenceMap containing reference inodes
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException thrown if there is an I/O error
     */
    private void saveINode2Image(INode inode, DataOutputStream out,
        boolean writeUnderConstruction, ReferenceMap referenceMap,
        Counter counter) throws IOException {
      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
          referenceMap);
      // Intentionally do not increment counter for reference inodes, because it
      // is too difficult at this point to assess whether or not this is a
      // reference that counts toward quota.
      if (!(inode instanceof INodeReference)) {
        counter.increment();
      }
    }
  }
}