
openstack-nova source code analysis (11): rebuild

rebuild:

Resets an instance to its initial state, or rebuilds it from a different image.
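Before diving into the code, it helps to see what triggers it: a rebuild is requested by POSTing a "rebuild" action to the server's action endpoint. Below is a minimal sketch using the requests library; the endpoint URL, token, server and image IDs are placeholder assumptions, not values tied to any real deployment.

import requests

# Placeholder values -- substitute your own endpoint, token and IDs.
NOVA_ENDPOINT = 'http://controller:8774/v2.1'
TOKEN = 'gAAAAAB...'
SERVER_ID = '9168b536-cd40-4630-b43f-b259807736c0'
IMAGE_ID = '70a599e0-31e7-49b7-b260-868f441e862b'

# POST /servers/{server_id}/action with a "rebuild" body; this is the
# request handled by _action_rebuild below.
body = {
    'rebuild': {
        'imageRef': IMAGE_ID,        # image to rebuild from
        'adminPass': 'new-secret',   # optional new admin password
        'name': 'rebuilt-server',    # optional new display name
    }
}
resp = requests.post(
    '%s/servers/%s/action' % (NOVA_ENDPOINT, SERVER_ID),
    json=body, headers={'X-Auth-Token': TOKEN})
# 202 on success; the response body includes adminPass when
# CONF.api.enable_instance_password is set.
print(resp.status_code)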

1. API

The API entry point is in nova/api/openstack/compute/servers.py:

@wsgi.action('rebuild')
@validation.schema(schema_server_rebuild_v20, '2.0', '2.0')
@validation.schema(schema_server_rebuild, '2.1', '2.18')
@validation.schema(schema_server_rebuild_v219, '2.19')
def _action_rebuild(self, req, id, body):
    """Rebuild an instance with the given attributes."""
    rebuild_dict = body['rebuild']

    # The image to rebuild from
    image_href = rebuild_dict["imageRef"]

    # The admin password requested for the rebuilt instance
    password = self._get_server_admin_password(rebuild_dict)

    # Load the instance and check the rebuild policy
    context = req.environ['nova.context']
    instance = self._get_server(context, req, id)
    context.can(server_policies.SERVERS % 'rebuild',
                target={'user_id': instance.user_id,
                        'project_id': instance.project_id})
    attr_map = {
        'name': 'display_name',
        'description': 'display_description',
        'metadata': 'metadata',
    }

    kwargs = {}

    helpers.translate_attributes(helpers.REBUILD, rebuild_dict, kwargs)

    for request_attribute, instance_attribute in attr_map.items():
        try:
            if request_attribute == 'name':
                kwargs[instance_attribute] = common.normalize_name(
                    rebuild_dict[request_attribute])
            else:
                kwargs[instance_attribute] = rebuild_dict[
                    request_attribute]
        except (KeyError, TypeError):
            pass

    try:
        # Perform the rebuild
        self.compute_api.rebuild(context,
                                 instance,
                                 image_href,
                                 password,
                                 **kwargs)
    except exception.InstanceIsLocked as e:
        raise exc.HTTPConflict(explanation=e.format_message())
    except exception.InstanceInvalidState as state_error:
        common.raise_http_conflict_for_instance_invalid_state(state_error,
                'rebuild', id)

    ......

    instance = self._get_server(context, req, id, is_detail=True)

    view = self._view_builder.show(req, instance, extend_address=False)

    # Add on the admin_password attribute since the view doesn't do it
    # unless instance passwords are disabled
    if CONF.api.enable_instance_password:
        view['server']['adminPass'] = password

    robj = wsgi.ResponseObject(view)
    return self._add_location(robj)
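The attr_map loop above simply renames optional request attributes to their instance field names, skipping anything the caller did not send. A standalone sketch of that behavior, with a simplified stand-in for common.normalize_name (an assumption: the real helper also validates the name, here we only strip whitespace):

# Simplified stand-in for common.normalize_name.
def normalize_name(name):
    return name.strip()

attr_map = {
    'name': 'display_name',
    'description': 'display_description',
    'metadata': 'metadata',
}

def translate(rebuild_dict):
    kwargs = {}
    for request_attribute, instance_attribute in attr_map.items():
        try:
            if request_attribute == 'name':
                kwargs[instance_attribute] = normalize_name(
                    rebuild_dict[request_attribute])
            else:
                kwargs[instance_attribute] = rebuild_dict[request_attribute]
        except (KeyError, TypeError):
            # Missing attributes are simply skipped
            pass
    return kwargs

print(translate({'name': '  web01  '}))
# {'display_name': 'web01'} -- 'description' and 'metadata' were absent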

           
2. compute rebuild

compute_api.rebuild is defined in nova/compute/api.py.

It is a long function, so we walk through it in pieces:

@check_instance_lock
@check_instance_cell
# Validate the VM state; rebuild is only allowed in these states
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
                                vm_states.ERROR])
def rebuild(self, context, instance, image_href, admin_password,
            files_to_inject=None, **kwargs):

    # Extract and validate the parameters
    files_to_inject = files_to_inject or []
    metadata = kwargs.get('metadata', {})
    preserve_ephemeral = kwargs.get('preserve_ephemeral', False)
    auto_disk_config = kwargs.get('auto_disk_config')

    image_id, image = self._get_image(context, image_href)
    self._check_auto_disk_config(image=image, **kwargs)

    flavor = instance.get_flavor()

    # Fetch the block device mappings for the instance
    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
        context, instance.uuid)
    root_bdm = compute_utils.get_root_bdm(context, instance, bdms)

    # Check to see if the image is changing and we have a volume-backed
    # server.
    is_volume_backed = compute_utils.is_volume_backed_instance(
        context, instance, bdms)
    if is_volume_backed:
        # For boot from volume, instance.image_ref is empty, so we need to
        # query the image from the volume.
        if root_bdm is None:
            # This shouldn't happen and is an error, we need to fail. This
            # is not the users fault, it's an internal error. Without a
            # root BDM we have no way of knowing the backing volume (or
            # image in that volume) for this instance.
            raise exception.NovaException(
                _('Unable to find root block device mapping for '
                  'volume-backed instance.'))

        volume = self.volume_api.get(context, root_bdm.volume_id)
        volume_image_metadata = volume.get('volume_image_metadata', {})
        orig_image_ref = volume_image_metadata.get('image_id')
    else:
        orig_image_ref = instance.image_ref

    # Validate the parameters, including the files to inject
    self._checks_for_create_and_rebuild(context, image_id, image,
            flavor, metadata, files_to_inject, root_bdm)

    kernel_id, ramdisk_id = self._handle_kernel_and_ramdisk(
            context, None, None, image)
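The volume-backed branch above matters because instance.image_ref is empty for boot-from-volume servers; the original image must instead be read from the root volume's volume_image_metadata. A minimal sketch of that decision, with the Cinder volume stubbed out as a plain dict (an assumption for illustration):

def get_orig_image_ref(instance_image_ref, is_volume_backed, volume):
    if is_volume_backed:
        # Boot-from-volume: the image id lives in the volume's metadata
        return volume.get('volume_image_metadata', {}).get('image_id')
    # Image-backed: the instance record already carries the image ref
    return instance_image_ref

volume = {'volume_image_metadata': {'image_id': 'aaaa-bbbb'}}
print(get_orig_image_ref('', True, volume))        # aaaa-bbbb
print(get_orig_image_ref('cccc-dddd', False, {}))  # cccc-dddd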


           

A nested helper resets the image properties stored in the instance's system metadata, removing the old values:

def _reset_image_metadata():
    """Remove old image properties that we're storing as instance
    system metadata.  These properties start with 'image_'.
    Then add the properties for the new image.
    """
    # FIXME(comstud): There's a race condition here in that if
    # the system_metadata for this instance is updated after
    # we do the previous save() and before we update.. those
    # other updates will be lost. Since this problem exists in
    # a lot of other places, I think it should be addressed in
    # a DB layer overhaul.

    orig_sys_metadata = dict(instance.system_metadata)
    # Remove the old keys
    for key in list(instance.system_metadata.keys()):
        if key.startswith(utils.SM_IMAGE_PROP_PREFIX):
            del instance.system_metadata[key]

    # Add the new ones
    new_sys_metadata = utils.get_system_metadata_from_image(
        image, flavor)

    instance.system_metadata.update(new_sys_metadata)
    instance.save()
    return orig_sys_metadata
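The effect on system_metadata is easy to see in isolation. Per the docstring above, the image properties are stored under keys starting with 'image_'; the sketch below mimics the key swap with plain dicts:

SM_IMAGE_PROP_PREFIX = 'image_'  # per the docstring above

def reset_image_metadata(system_metadata, new_image_props):
    """Drop old image_* keys and merge in the new image's properties."""
    orig = dict(system_metadata)  # snapshot of the pre-rebuild values
    for key in list(system_metadata.keys()):
        if key.startswith(SM_IMAGE_PROP_PREFIX):
            del system_metadata[key]
    system_metadata.update(new_image_props)
    return orig

sm = {'image_os_type': 'linux', 'instance_type_name': 'm1.small'}
orig = reset_image_metadata(sm, {'image_os_type': 'windows'})
print(sm)    # {'instance_type_name': 'm1.small', 'image_os_type': 'windows'}
print(orig)  # the pre-rebuild snapshot, returned as orig_sys_metadata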
           

Back in rebuild, the instance is updated with values inherited from the new image and its state is set:

    # Since image might have changed, we may have new values for
    # os_type, vm_mode, etc
    options_from_image = self._inherit_properties_from_image(
            image, auto_disk_config)
    instance.update(options_from_image)

    instance.task_state = task_states.REBUILDING
    instance.image_ref = image_href
    instance.kernel_id = kernel_id or ""
    instance.ramdisk_id = ramdisk_id or ""
    instance.progress = 0
    instance.update(kwargs)
    instance.save(expected_task_state=[None])

           
    orig_sys_metadata = _reset_image_metadata()

    self._record_action_start(context, instance, instance_actions.REBUILD)

    # Fetch (and possibly adjust) the RequestSpec used for scheduling
    host = instance.host
    try:
        request_spec = objects.RequestSpec.get_by_instance_uuid(
            context, instance.uuid)
        # If a new image is provided on rebuild, we will need to run
        # through the scheduler again, but we want the instance to be
        # rebuilt on the same host it's already on.
        if orig_image_ref != image_href:
            # We have to modify the request spec that goes to the scheduler
            # to contain the new image. We persist this since we've already
            # changed the instance.image_ref above so we're being
            # consistent.
            request_spec.image = objects.ImageMeta.from_dict(image)
            request_spec.save()
            if 'scheduler_hints' not in request_spec:
                request_spec.scheduler_hints = {}
            # Nuke the id on this so we can't accidentally save
            # this hint hack later
            del request_spec.id

            # NOTE(danms): Passing host=None tells conductor to
            # call the scheduler. The _nova_check_type hint
            # requires that the scheduler returns only the same
            # host that we are currently on and only checks
            # rebuild-related filters.

            # When the image changes, the request goes through the
            # scheduler again, but it is forced back onto the current host
            request_spec.scheduler_hints['_nova_check_type'] = ['rebuild']
            request_spec.force_hosts = [instance.host]
            request_spec.force_nodes = [instance.node]
            host = None
    except exception.RequestSpecNotFound:
        # Some old instances can still have no RequestSpec object attached
        # to them, we need to support the old way
        request_spec = None
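The net effect of this block is a small decision: if the image is unchanged, rebuild directly on instance.host; if it changed, set host=None so the conductor re-runs the scheduler, but pin force_hosts/force_nodes so the "new" placement is still the current host, with only rebuild-related filters applied via the _nova_check_type hint. A condensed sketch of that logic, using a plain dict in place of the RequestSpec object:

def plan_rebuild(orig_image_ref, image_href, instance_host, instance_node):
    spec = {'scheduler_hints': {}, 'force_hosts': [], 'force_nodes': []}
    host = instance_host
    if orig_image_ref != image_href:
        # New image: re-run the scheduler, but force the same host so
        # only rebuild-related filters can reject the placement.
        spec['scheduler_hints']['_nova_check_type'] = ['rebuild']
        spec['force_hosts'] = [instance_host]
        spec['force_nodes'] = [instance_node]
        host = None  # host=None tells conductor to call the scheduler
    return host, spec

print(plan_rebuild('img-1', 'img-1', 'node-1', 'node-1')[0])  # 'node-1'
print(plan_rebuild('img-1', 'img-2', 'node-1', 'node-1')[0])  # None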

           

Finally, the rebuild is handed off:

    self.compute_task_api.rebuild_instance(context, instance=instance,
            new_pass=admin_password, injected_files=files_to_inject,
            image_ref=image_href, orig_image_ref=orig_image_ref,
            orig_sys_metadata=orig_sys_metadata, bdms=bdms,
            preserve_ephemeral=preserve_ephemeral, host=host,
            request_spec=request_spec,
            kwargs=kwargs)


           

This ultimately calls rebuild_instance in the conductor.

3. conductor rebuild

rebuild_instance is defined in nova/conductor/manager.py:

def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                     injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage,
                     preserve_ephemeral=False, host=None,
                     request_spec=None):

    with compute_utils.EventReporter(context, 'rebuild_server',
                                      instance.uuid):
        node = limits = None

        try:
            migration = objects.Migration.get_by_instance_and_status(
                context, instance.uuid, 'accepted')
        except exception.MigrationNotFoundByStatus:
            LOG.debug("No migration record for the rebuild/evacuate "
                      "request.", instance=instance)
            migration = None

        # If no host was specified, schedule again
        if not host:
            if not request_spec:
                image_meta = nova_object.obj_to_primitive(
                    instance.image_meta)
                request_spec = scheduler_utils.build_request_spec(
                        context, image_meta, [instance])
            elif recreate:
                request_spec.ignore_hosts = request_spec.ignore_hosts or []
                request_spec.ignore_hosts.append(instance.host)

                request_spec.reset_forced_destinations()
                filter_properties = request_spec.\
                    to_legacy_filter_properties_dict()
                request_spec = request_spec.to_legacy_request_spec_dict()
            else:
                filter_properties = request_spec. \
                    to_legacy_filter_properties_dict()
                request_spec = request_spec.to_legacy_request_spec_dict()
            try:
                # Pick a host; since force_hosts was set earlier, the
                # scheduler will still land on the original host
                hosts = self._schedule_instances(
                        context, request_spec, filter_properties)
                host_dict = hosts.pop(0)
                host, node, limits = (host_dict['host'],
                                      host_dict['nodename'],
                                      host_dict['limits'])
            except exception.NoValidHost as ex:
                ......

        compute_utils.notify_about_instance_usage(
            self.notifier, context, instance, "rebuild.scheduled")

        # Call rebuild_instance on the target compute host
        self.compute_rpcapi.rebuild_instance(context,
                instance=instance,
                new_pass=new_pass,
                injected_files=injected_files,
                image_ref=image_ref,
                orig_image_ref=orig_image_ref,
                orig_sys_metadata=orig_sys_metadata,
                bdms=bdms,
                recreate=recreate,
                on_shared_storage=on_shared_storage,
                preserve_ephemeral=preserve_ephemeral,
                migration=migration,
                host=host, node=node, limits=limits)

           
4. compute rebuild_instance

The compute-manager side of compute_rpcapi.rebuild_instance is defined in nova/compute/manager.py.

Definition:

@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                     injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage=None,
                     preserve_ephemeral=False, migration=None,
                     scheduled_node=None, limits=None):

    context = context.elevated()

    LOG.info(_LI("Rebuilding instance"), instance=instance)

    # recreate is only set for evacuate; for rebuild it is False,
    # so this branch can be ignored
    if recreate:
        rt = self._get_resource_tracker()
        rebuild_claim = rt.rebuild_claim
    else:
        rebuild_claim = claims.NopClaim

    image_meta = {}
    if image_ref:
        image_meta = self.image_api.get(context, image_ref)

    # Confirm the node; if none was scheduled, keep the instance's
    # current node so the host does not change
    if not scheduled_node:
        if recreate:
            try:
                compute_node = self._get_compute_info(context, self.host)
                scheduled_node = compute_node.hypervisor_hostname
            except exception.ComputeHostNotFound:
                LOG.exception(_LE('Failed to get compute_info for %s'),
                              self.host)
        else:
            scheduled_node = instance.node

    with self._error_out_instance_on_exception(context, instance):
        try:
            # Perform the rebuild under a (possibly no-op) resource claim
            claim_ctxt = rebuild_claim(
                context, instance, scheduled_node,
                limits=limits, image_meta=image_meta,
                migration=migration)
            self._do_rebuild_instance_with_claim(
                claim_ctxt, context, instance, orig_image_ref,
                image_ref, injected_files, new_pass, orig_sys_metadata,
                bdms, recreate, on_shared_storage, preserve_ephemeral,
                migration)
        except exception.ComputeResourcesUnavailable as e:
            .....
        else:
            instance.apply_migration_context()
            # NOTE (ndipanov): This save will now update the host and node
            # attributes making sure that next RT pass is consistent since
            # it will be based on the instance and not the migration DB
            # entry.
            instance.host = self.host
            instance.node = scheduled_node
            instance.save()
            instance.drop_migration_context()

            # NOTE (ndipanov): Mark the migration as done only after we
            # mark the instance as belonging to this host.
            self._set_migration_status(migration, 'done')

           

_do_rebuild_instance_with_claim

Definition:

def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
    """Helper to avoid deep nesting in the top-level method."""

    with claim_context:
        self._do_rebuild_instance(*args, **kwargs)
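This works because both the real rebuild claim and claims.NopClaim behave as context managers: entering performs (or skips) the resource accounting, and exiting on an exception aborts the claim. A minimal sketch of that shape, assuming a simplified abort interface (the real classes live in nova/compute/claims.py):

class NopClaim(object):
    """A no-op claim: rebuild on the same host needs no new resources."""
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            self.abort()

    def abort(self):
        pass  # a real claim would release the reserved resources here

with NopClaim():
    print('rebuild work happens under the claim')

The real rebuild work then happens in _do_rebuild_instance: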

           
def _do_rebuild_instance(self, context, instance, orig_image_ref,
                         image_ref, injected_files, new_pass,
                         orig_sys_metadata, bdms, recreate,
                         on_shared_storage, preserve_ephemeral,
                         migration):
    orig_vm_state = instance.vm_state

    # The recreate (evacuate) branch can be skipped for rebuild
    if recreate:
        ....

    if image_ref:
        image_meta = objects.ImageMeta.from_image_ref(
            context, self.image_api, image_ref)
    else:
        image_meta = instance.image_meta

    # Emit usage notifications for the original and new image
    orig_image_ref_url = glance.generate_image_url(orig_image_ref)
    extra_usage_info = {'image_ref_url': orig_image_ref_url}
    compute_utils.notify_usage_exists(
            self.notifier, context, instance,
            current_period=True, system_metadata=orig_sys_metadata,
            extra_usage_info=extra_usage_info)

    # This message should contain the new image_ref
    extra_usage_info = {'image_name': self._get_image_name(image_meta)}
    self._notify_about_instance_usage(context, instance,
            "rebuild.start", extra_usage_info=extra_usage_info)

    # Update the instance state
    instance.power_state = self._get_power_state(context, instance)
    instance.task_state = task_states.REBUILDING
    instance.save(expected_task_state=[task_states.REBUILDING])

    if recreate:
        ...
    else:
        network_info = compute_utils.get_nw_info_for_instance(instance)

    if bdms is None:
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)

    block_device_info = \
        self._get_instance_block_device_info(
                context, instance, bdms=bdms)

    def detach_block_devices(context, bdms):
        for bdm in bdms:
            if bdm.is_volume:
                self._detach_volume(context, bdm.volume_id, instance,
                                    destroy_bdm=False)

    files = self._decode_files(injected_files)

    kwargs = dict(
        context=context,
        instance=instance,
        image_meta=image_meta,
        injected_files=files,
        admin_password=new_pass,
        bdms=bdms,
        detach_block_devices=detach_block_devices,
        attach_block_devices=self._prep_block_device,
        block_device_info=block_device_info,
        network_info=network_info,
        preserve_ephemeral=preserve_ephemeral,
        recreate=recreate)
    try:
        # Ask the virt driver to rebuild
        with instance.mutated_migration_context():
            self.driver.rebuild(**kwargs)
    except NotImplementedError:
        # NOTE(rpodolyaka): driver doesn't provide specialized version
        # of rebuild, fall back to the default implementation
        # In practice this default path performs the rebuild
        self._rebuild_default_impl(**kwargs)

    # Update the instance state and emit events
    self._update_instance_after_spawn(context, instance)
    instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])

    if orig_vm_state == vm_states.STOPPED:
        LOG.info(_LI("bringing vm to original state: '%s'"),
                    orig_vm_state, instance=instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.progress = 0
        instance.save()
        self.stop_instance(context, instance, False)
    self._update_scheduler_instance_info(context, instance)
    self._notify_about_instance_usage(
            context, instance, "rebuild.end",
            network_info=network_info,
            extra_usage_info=extra_usage_info)
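Note how detach_block_devices and attach_block_devices are passed into kwargs as callables: _rebuild_default_impl does not know how volumes are detached or prepared, it just invokes whatever the manager handed it. A tiny sketch of this injection style, with hypothetical stand-in callables:

def rebuild_default_impl(context, bdms, detach_block_devices,
                         attach_block_devices):
    detach_block_devices(context, bdms)        # manager-supplied policy
    return attach_block_devices(context, bdms)

def fake_detach(context, bdms):
    print('detaching %d block devices' % len(bdms))

def fake_attach(context, bdms):
    print('re-attaching block devices')
    return {'block_device_mapping': bdms}

rebuild_default_impl(None, ['vda'], fake_detach, fake_attach)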
           

Note:

The libvirt driver does not provide a rebuild method, so self.driver.rebuild(**kwargs) raises NotImplementedError and the actual work is done by

self._rebuild_default_impl(**kwargs):

def _rebuild_default_impl(self, context, instance, image_meta,
                          injected_files, admin_password, bdms,
                          detach_block_devices, attach_block_devices,
                          network_info=None,
                          recreate=False, block_device_info=None,
                          preserve_ephemeral=False):
    if preserve_ephemeral:
        # The default code path does not support preserving ephemeral
        # partitions.
        raise exception.PreserveEphemeralNotSupported()

    if recreate:
        detach_block_devices(context, bdms)
    else:
        # Power off the instance
        self._power_off_instance(context, instance, clean_shutdown=True)
        # Detach the volumes
        detach_block_devices(context, bdms)
        # Destroy the guest
        self.driver.destroy(context, instance,
                            network_info=network_info,
                            block_device_info=block_device_info)

    # Update the instance's task state
    instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
    instance.save(expected_task_state=[task_states.REBUILDING])

    new_block_device_info = attach_block_devices(context, instance, bdms)

    instance.task_state = task_states.REBUILD_SPAWNING
    instance.save(
        expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])

    # Spawn a new guest with the driver, just like instance creation
    with instance.mutated_migration_context():
        self.driver.spawn(context, instance, image_meta, injected_files,
                          admin_password, network_info=network_info,
                          block_device_info=new_block_device_info)
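Each save above passes expected_task_state, a compare-and-swap guard: the update only succeeds if the stored row still holds the expected task state, which catches concurrent state changes. A simplified sketch of that guard, with UnexpectedTaskStateError mirroring the Nova exception of the same name:

class UnexpectedTaskStateError(Exception):
    pass

class FakeInstance(object):
    def __init__(self):
        self.task_state = 'rebuilding'
        self._db_task_state = 'rebuilding'  # what the "database" holds

    def save(self, expected_task_state=None):
        if (expected_task_state is not None
                and self._db_task_state not in expected_task_state):
            raise UnexpectedTaskStateError(self._db_task_state)
        self._db_task_state = self.task_state  # commit the update

inst = FakeInstance()
inst.task_state = 'rebuild_block_device_mapping'
inst.save(expected_task_state=['rebuilding'])  # OK: db still 'rebuilding'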

           
5. spawn: creating the guest

driver.spawn is the same code path used when creating a new instance.

It is defined in nova/virt/libvirt/driver.py.

The creation process is defined as follows:

# NOTE(ilyaalekseyev): Implementation like in multinics
# for xenapi(tr3buchet)
def spawn(self, context, instance, image_meta, injected_files,
          admin_password, network_info=None, block_device_info=None):
    disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
                                        instance,
                                        image_meta,
                                        block_device_info)
    injection_info = InjectionInfo(network_info=network_info,
                                   files=injected_files,
                                   admin_pass=admin_password)
    gen_confdrive = functools.partial(self._create_configdrive,
                                      context, instance,
                                      injection_info)
    self._create_image(context, instance, disk_info['mapping'],
                       injection_info=injection_info,
                       block_device_info=block_device_info)

    # Required by Quobyte CI
    self._ensure_console_log_for_instance(instance)

    xml = self._get_guest_xml(context, instance, network_info,
                              disk_info, image_meta,
                              block_device_info=block_device_info)
    self._create_domain_and_network(
        context, xml, instance, network_info, disk_info,
        block_device_info=block_device_info,
        post_xml_callback=gen_confdrive,
        destroy_disks_on_failure=True)
    LOG.debug("Instance is running", instance=instance)

    def _wait_for_boot():
        """Called at an interval until the VM is running."""
        state = self.get_info(instance).state

        if state == power_state.RUNNING:
            LOG.info(_LI("Instance spawned successfully."),
                     instance=instance)
            raise loopingcall.LoopingCallDone()

    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_boot)
    timer.start(interval=0.5).wait()
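FixedIntervalLoopingCall simply re-invokes _wait_for_boot every 0.5 seconds until it raises LoopingCallDone. Stripped of the oslo machinery, the behavior is roughly the time.sleep loop below; get_state is a hypothetical stand-in for self.get_info(instance).state, and the timeout is an assumption added for safety:

import time

RUNNING = 'running'

def wait_for_boot(get_state, interval=0.5, timeout=60):
    """Rough equivalent of the FixedIntervalLoopingCall above."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_state() == RUNNING:
            return True   # plays the role of raising LoopingCallDone
        time.sleep(interval)
    return False

# Simulated guest that reaches RUNNING on the third poll.
states = iter(['paused', 'paused', RUNNING])
print(wait_for_boot(lambda: next(states)))  # True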

           

This part is identical to normal instance creation, so we do not analyze it further here.
