基于OpenStack Queens版
一、建立vGPU虛機流程簡述
[nova-api程序]
1、nova/api/openstack/compute/servers.py create()
2、nova/compute/api.py create() 調用同檔案中 _create_instance()
3、nova/conductor/api.py
build_instances() --> nova/conductor/rpcapi.py build_instances() rpc遠端調用
[nova-conductor程序]
4、nova/conductor/manager.py
build_instances()通過self.compute_rpcapi.build_and_run_instance --> rpc遠端調用nova/compute/rpcapi.py的build_and_run_instance()
[nova-compute程序]
5、nova/compute/manager.py
build_and_run_instance() --> _do_build_and_run_instance() --> _build_and_run_instance(),調用_build_resources建立對應(vGPU)資源,然後調spawn()方法,建立libvirt執行個體。
二、spawn方法建立vGPU執行個體入口函數
1、傳入包含vGPU資訊的allocations參數到_allocate_mdevs方法,生成并傳回mdev裝置(allocations表示配置設定給此instance的資源,通過placement api擷取)
2、_get_guest_xml()方法傳入mdevs參數,生成執行個體xml
3、調用libvirt API建立執行個體
# nova\virt\libvirt\driver.py
def spawn(self, context, instance, image_meta, injected_files,
          admin_password, allocations, network_info=None,
          block_device_info=None):
    # NOTE: abridged excerpt -- disk_info, gen_confdrive and power_on are
    # produced by code elided from this snippet.
    # Ask for mediated devices matching the vGPU allocations that placement
    # granted to this instance.
    mdevs = self._allocate_mdevs(allocations)
    # The chosen mdev UUIDs are folded into the generated guest XML.
    xml = self._get_guest_xml(context, instance, network_info,
                              disk_info, image_meta,
                              block_device_info=block_device_info,
                              mdevs=mdevs)
    # Finally define and start the libvirt domain.
    self._create_domain_and_network(
        context, xml, instance, network_info,
        block_device_info=block_device_info,
        post_xml_callback=gen_confdrive,
        destroy_disks_on_failure=True,
        power_on=power_on)
三、spawn方法中,vGPU相關變量allocations如何生成
傳入vGPU資源變量,通過REST方式,調用placement API擷取計算節點對應vGPU資源
1,nova\compute\manager.py
def _build_and_run_instance():
    # Abridged excerpt: build the (vGPU and other) resources, then hand the
    # allocations over to the virt driver's spawn().
    try:
        with self._build_resources(context, instance, requested_networks, security_groups, image_meta, block_device_mapping) as resources:
            self.driver.spawn(context, instance, image_meta, injected_files, admin_password, allocs, network_info=network_info, block_device_info=block_device_info)
def _build_resources():
    # Abridged excerpt: the allocations for this instance (including any
    # vGPU resources) are fetched from the placement API via the
    # scheduler report client.
    try:
        resources['allocations'] = (self.reportclient.get_allocations_for_consumer(context,instance.uuid))
2、 nova/scheduler/client/report.py
# Calls the placement API over REST to fetch the allocations for the given
# consumer; the returned dict includes any vGPU resources.
def get_allocations_for_consumer(self, context, consumer):
    """Return the placement allocations dict for *consumer* (a UUID)."""
    resp = self.get('/allocations/%s' % consumer,
                    global_request_id=context.global_id)
    if resp:
        return resp.json()['allocations']
    # A falsy response (error or no content) means no allocations.
    return {}
四、allocate_mdevs方法擷取mdev裝置
1、allocate_mdevs方法,流程概覽
1)_vgpu_allocations方法,過濾隻與vGPU相關的allocations資訊
2)_get_supported_vgpu_types方法,從nova配置檔案讀取目前節點支援的vgpu類型,對應參數enabled_vgpu_types
3)_get_existing_mdevs_not_assigned 方法,擷取目前節點可用的mdev裝置,即已建立成功,但未配置設定出去的mdev裝置清單。若能擷取到有效的mdev裝置,則傳回裝置uuid,否則執行下一步建立mdev裝置。
4)_create_new_mediated_device方法,根據實體裝置,建立新的mdev裝置。适用于初始化mdev裝置使用,以及後續新增GPU裝置的場景。
調用流程圖:
代碼實作入口:
@utils.synchronized(VGPU_RESOURCE_SEMAPHORE)
def _allocate_mdevs(self, allocations):
    """Return the mediated-device UUIDs satisfying the vGPU allocations.

    Reuses existing unassigned mdevs when available, otherwise creates new
    mdevs on a physical device that still has spare capacity.
    """
    vgpu_allocations = self._vgpu_allocations(allocations)
    if not vgpu_allocations:
        return
    if len(vgpu_allocations) > 1:
        LOG.warning('More than one allocation was passed over to libvirt '
                    'while at the moment libvirt only supports one. Only '
                    'the first allocation will be looked up.')
    # Only the first allocation is honoured (libvirt limitation).
    alloc = six.next(six.itervalues(vgpu_allocations))
    vgpus_asked = alloc['resources'][fields.ResourceClass.VGPU]
    # The single enabled type read from CONF.devices.enabled_vgpu_types.
    requested_types = self._get_supported_vgpu_types()
    # Mediated devices already created but not attached to any guest.
    mdevs_available = self._get_existing_mdevs_not_assigned(requested_types)
    chosen_mdevs = []
    for _ in six.moves.range(vgpus_asked):
        # Prefer reusing a free mdev; fall back to creating a new one.
        chosen_mdev = (mdevs_available.pop() if mdevs_available
                       else self._create_new_mediated_device(requested_types))
        if not chosen_mdev:
            # No physical device has spare vGPU capacity left.
            raise exception.ComputeResourcesUnavailable(
                reason='vGPU resource is not available')
        chosen_mdevs.append(chosen_mdev)
    return chosen_mdevs
2、_vgpu_allocations方法分析
_vgpu_allocations方法,過濾allocations,過濾allocations擷取vGPU相關請求
@staticmethod
def _vgpu_allocations(allocations):
if not allocations:
# If no allocations, there is no vGPU request.
return {}
RC_VGPU = fields.ResourceClass.VGPU
vgpu_allocations = {}
for rp in allocations:
res = allocations[rp]['resources']
if RC_VGPU in res and res[RC_VGPU] > 0:
vgpu_allocations[rp] = {'resources': {RC_VGPU: res[RC_VGPU]}}
return vgpu_allocations
3、_get_supported_vgpu_types方法分析
_get_supported_vgpu_types方法,讀取計算節點nova配置檔案中支援的vGPU類型
# Reads CONF.devices.enabled_vgpu_types; libvirt supports only a single
# vGPU type per compute node (e.g. nvidia-319), so only the first entry
# is ever used.
def _get_supported_vgpu_types(self):
    """Return the (at most single-element) list of enabled vGPU types."""
    enabled = CONF.devices.enabled_vgpu_types
    if not enabled:
        return []
    # TODO(sbauza): Move this check up to compute_manager.init_host
    if len(enabled) > 1:
        LOG.warning('libvirt only supports one GPU type per compute node,'
                    ' only first type will be used.')
    return enabled[:1]
4、_get_existing_mdevs_not_assigned 方法分析
_get_existing_mdevs_not_assigned 方法,調用libvirt接口,擷取目前節點可用的mdev裝置
擷取未配置設定狀态的mdev裝置,為下一步建立mdev裝置做準備:
第一步,從所有虛機中,檢視已配置設定出去的mdev裝置清單;
第二步,查詢目前節點上所有mdev裝置清單;
第三步,所有的裝置清單,減去已配置設定的裝置清單,就是可用的裝置清單,available_mdevs。
def _get_existing_mdevs_not_assigned(self, requested_types=None):
allocated_mdevs = self._get_all_assigned_mediated_devices()
mdevs = self._get_mediated_devices(requested_types)
available_mdevs = set([mdev["uuid"]
for mdev in mdevs]) - set(allocated_mdevs)
return available_mdevs
4.1 _get_all_assigned_mediated_devices方法,調用libvirt接口,擷取計算節點所有虛機中已配置設定出去的所有mdev裝置,以字典格式傳回。
def _get_all_assigned_mediated_devices(self, instance=None):
allocated_mdevs = {}
# 暫不考慮指定instance的情況
if instance:
try:
guest = self._host.get_guest(instance)
except exception.InstanceNotFound:
return {}
guests = [guest]
else:
# 調用libvirt接口,擷取所有虛機
guests = self._host.list_guests(only_running=False)
for guest in guests:
# 周遊所有guest,查詢guest(XML配置)的devices清單中,是否有mdev裝置,若有,則記錄虛機的uuid到allocated_mdevs字典。字典格式{mdev裝置uuid:虛機uuid}
cfg = guest.get_config()
for device in cfg.devices:
if isinstance(device, vconfig.LibvirtConfigGuestHostdevMDEV):
allocated_mdevs[device.uuid] = guest.uuid
return allocated_mdevs
4.2 _get_mediated_devices方法,根據nova配置,過濾指定vGPU類型的mdev裝置
# Queried from libvirt; only mdevs whose type matches the nova config
# (CONF.devices.enabled_vgpu_types) are returned, as a list of dicts.
def _get_mediated_devices(self, types=None):
    """Return host mediated devices, optionally filtered by *types*."""
    if not self._host.has_min_version(MIN_LIBVIRT_MDEV_SUPPORT):
        return []
    names = self._host.list_mediated_devices() or []
    infos = (self._get_mediated_device_information(name) for name in names)
    return [dev for dev in infos if not types or dev["type"] in types]
5、_create_new_mediated_device方法建立mdev裝置
# 找到一個可以支援新中介mediated裝置的實體裝置,建立mediated裝置。
def _create_new_mediated_device(self, requested_types, uuid=None):
# 周遊擷取所有可用的mdev裝置,傳回mdev裝置清單【以下for循環建立所有mdev裝置,其中chosen_mdev會有多次重新整理,最終隻傳回最後一個生成的chosen_mdev】
devices = self._get_mdev_capable_devices(requested_types)
for device in devices:
# 周遊目前所有可用的mdev裝置,調用create_mdev,注入uuid建立mdev裝置
asked_type = requested_types[0]
if device['types'][asked_type]['availableInstances'] > 0:
# That physical GPU has enough room for a new mdev
dev_name = device['dev_id']
# We need the PCI address, not the libvirt name
# The libvirt name is like 'pci_0000_84_00_0'
pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
chosen_mdev = nova.privsep.libvirt.create_mdev(pci_addr,
asked_type,
uuid=uuid)
return chosen_mdev
5.1 _get_mdev_capable_devices()方法,擷取支援mdev類型的主機實體裝置(pGPU卡)
def _get_mdev_capable_devices(self, types=None):
    """Return host physical devices (pGPUs) able to create mdevs."""
    if not self._host.has_min_version(MIN_LIBVIRT_MDEV_SUPPORT):
        return []
    names = self._host.list_mdev_capable_devices() or []
    capable = []
    for name in names:
        info = self._get_mdev_capabilities_for_dev(name, types)
        # Skip devices that expose none of the requested types.
        if info["types"]:
            capable.append(info)
    return capable
#5.1.1 list_mdev_capable_devices()方法,調用libvirt接口,查找支援mdev功能的裝置
def list_mdev_capable_devices(self, flags=0):
    """Lookup devices supporting mdev capabilities.

    :returns: a list of virNodeDevice instance
    """
    capability = "mdev_types"
    return self._list_devices(capability, flags=flags)
def _list_devices(self, cap, flags=0):
"""Lookup devices.
:returns: a list of virNodeDevice instance
"""
try:
return self.get_connection().listDevices(cap, flags)
except libvirt.libvirtError as ex:
error_code = ex.get_error_code()
if error_code == libvirt.VIR_ERR_NO_SUPPORT:
LOG.warning("URI %(uri)s does not support "
"listDevices: %(error)s",
{'uri': self._uri, 'error': ex})
return []
else:
raise
#5.1.2 _get_mdev_capabilities_for_dev方法,用于周遊GPU卡的過程中,提取有效的裝置資訊,并組合成清單形式傳回
# 最終單個GPU卡傳回一個具有MDEV功能裝置的dict,device = {"dev_id": cfgdev.name,"types": {.....}}
# 傳回一個具有MDEV功能裝置的dict,該裝置資訊的字典中,字典的第一組資訊為ID值的kv,字典的第二組資訊為受支援類型的清單,每個類型也都是dict字典類型。
def _get_mdev_capabilities_for_dev(self, devname, types=None):
    """Returns a dict of MDEV capable device with the ID as first key
    and then a list of supported types, each of them being a dict.

    :param types: Only return those specific types.
    """
    virtdev = self._host.device_lookup_by_name(devname)
    cfgdev = vconfig.LibvirtConfigNodeDevice()
    # Parse the node-device XML reported by libvirt.
    cfgdev.parse_str(virtdev.XMLDesc(0))
    device = {"dev_id": cfgdev.name, "types": {}}
    for mdev_cap in cfgdev.pci_capability.mdev_capability:
        for cap in mdev_cap.mdev_types:
            if types and cap['type'] not in types:
                continue
            device["types"][cap['type']] = {
                'availableInstances': cap['availableInstances'],
                'name': cap['name'],
                'deviceAPI': cap['deviceAPI'],
            }
    return device
def device_lookup_by_name(self, name):
    """Lookup a node device by its name.

    :returns: a virNodeDevice instance
    """
    conn = self.get_connection()
    return conn.nodeDeviceLookupByName(name)
5.2 create_mdev()方法,指定實體裝置的mdev裝置,寫入随機uuid
通過向 /sys/class/mdev_bus/&lt;實體裝置&gt;/mdev_supported_types/&lt;類型&gt;/create 寫入一個UUID,就可以建立一個該類型的Mediated Device。
@nova.privsep.sys_admin_pctxt.entrypoint
def create_mdev(physical_device, mdev_type, uuid=None):
    """Instantiate a mediated device.

    Writing a UUID to the type's sysfs ``create`` node asks the kernel to
    instantiate an mdev of that type on the physical device. The UUID is
    generated when not supplied, and returned either way.
    """
    mdev_uuid = uuidutils.generate_uuid() if uuid is None else uuid
    fpath = '/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/create'
    with open(fpath.format(physical_device, mdev_type), 'w') as sysfs_create:
        sysfs_create.write(mdev_uuid)
    return mdev_uuid
五、vGPU資源的釋放
虛機挂起、關機、删除,都會釋放vGPU資源
1、虛機删除操作,直接調用libvirt的destroy方法,之後vGPU資源釋放到資源池。
2、vGPU虛機的暫停、挂起操作,mdev裝置的處理
pause()方法調用suspend()方法,suspend調用_detach_mediated_devices()方法,執行detach_device,釋放mdev裝置
# nova\virt\libvirt\driver.py
def pause(self, instance):
    """Pause VM instance."""
    guest = self._host.get_guest(instance)
    guest.pause()
# nova\virt\libvirt\guest.py
def pause(self):
    """Suspend execution of the underlying libvirt domain."""
    domain = self._domain
    domain.suspend()
# nova\virt\libvirt\driver.py
def suspend(self, context, instance):
    """Suspend the specified instance."""
    guest = self._host.get_guest(instance)
    # Detach PCI passthrough devices before saving the memory state.
    self._detach_pci_devices(guest,pci_manager.get_instance_pci_devs(instance))
    self._detach_direct_passthrough_ports(context, instance, guest)
    # Detaching the mediated devices releases the vGPUs back to the pool.
    self._detach_mediated_devices(guest)
    guest.save_memory_state()
3、rescue方法,先擷取原來vGPU虛機對應mdev裝置資訊,再組裝xml建立執行個體
與suspend釋放mdev裝置的流程相對應,rescue方法先調用_get_all_assigned_mediated_devices()方法,擷取執行個體原有的mdev清單,再調用_get_guest_xml()方法,把mdev裝置資訊組裝到xml中。
def rescue():
    # Abridged excerpt: look up the mdevs already assigned to this
    # instance so the rebuilt domain reuses the same vGPU devices.
    mdevs = self._get_all_assigned_mediated_devices(instance)
    mdevs = list(mdevs.keys())
    # The mdev UUIDs are folded back into the new guest XML.
    xml = self._get_guest_xml(context,...,mdevs=mdevs)
    self._destroy(instance)
    self._create_domain(xml, ...)
4、reboot和power_on,都有調用_hard_reboot方法
在_hard_reboot方法中,也是先調用_get_all_assigned_mediated_devices()方法,傳入instance參數,擷取執行個體對應的mdev清單,再調用_get_guest_xml()方法,把mdev裝置資訊組裝到xml中。
删除(挂起)執行個體時,會清除釋放mdev裝置;重新調用libivrt建立原來執行個體時,需要重新擷取執行個體銷毀之前使用的mdev裝置(清單),以便重用原來的mdev裝置。
def _hard_reboot():
    # Abridged excerpt: fetch the instance's previously-assigned mdevs
    # before destroying the domain, so they can be reused on re-creation.
    mdevs = self._get_all_assigned_mediated_devices(instance)
    mdevs = list(mdevs.keys())
    self.destroy(...)
    # Pass the mdevs the VM was using so the new domain keeps its vGPUs.
    xml = self._get_guest_xml(context,...,mdevs=mdevs)
    self._create_domain_and_network()
5、對比建立執行個體流程(先申請mdev裝置,再組裝xml)
def spawn():
    # Abridged excerpt: unlike reboot/rescue, spawn allocates NEW mdevs
    # from the placement allocations before assembling the guest XML.
    mdevs = self._allocate_mdevs(allocations)
    xml = self._get_guest_xml(context,....,mdevs=mdevs)
    self._create_domain_and_network(context, xml,...)
六、vGPU執行個體遷移
目前冷熱遷移都不支援,可以先建立鏡像再建立執行個體,模拟冷遷移操作。