一、 Linux內核對象與對象集二、 sysfs三、 設備模型四、 SCSI子系統五、 SCSI磁盤驅動sd
六、 SCSI Target--TCM
七、 用戶空間IO--UIO
八、 在用戶空間實現虛擬SCSI磁盤--TCMU
九、 通用塊層
十、文件系統--VFS
內核對象作為Linux設備驅動模型的基礎,主要是抽象和封裝總線、設備、驅動、類和接口之間關系的具體實現代碼,并在sysfs中呈現。主要抽象成kobject和kset結構:
/*
 * struct kobject - basic kernel object, one node of the hierarchy that is
 * exported through sysfs.
 */
struct kobject {
	const char		*name;		/* name shown in sysfs */
	struct list_head	entry;		/* links this kobject into its kset's list */
	struct kobject		*parent;	/* parent kobject; expresses the tree structure */
	struct kset		*kset;		/* kset this kobject is linked into */
	struct kobj_type	*ktype;		/* common methods and attributes of this kobject */
	struct kernfs_node	*sd;		/* sysfs directory entry */
	struct kref		kref;		/* reference count */
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
	struct delayed_work	release;	/* delayed work item, debug builds only */
#endif
	unsigned int state_initialized:1;	/* set once the kobject has been initialized */
	unsigned int state_in_sysfs:1;		/* set while the kobject is present in sysfs */
	unsigned int state_add_uevent_sent:1;	/* an ADD uevent was sent to user space */
	unsigned int state_remove_uevent_sent:1; /* a REMOVE uevent was sent to user space */
	unsigned int uevent_suppress:1;		/* uevents for this kobject are suppressed */
};
在kobject結構中ktype域是對kobject一些通用方法和屬性進行封裝:
/*
 * struct kobj_type - methods and attributes shared by kobjects of one type.
 */
struct kobj_type {
	/* callback invoked when the kobject structure is being released */
	void (*release)(struct kobject *kobj);
	/* sysfs operation functions */
	const struct sysfs_ops *sysfs_ops;
	/* default attributes */
	struct attribute **default_attrs;
	/* namespace-related operations */
	const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
	const void *(*namespace)(struct kobject *kobj);
};
kset是一組kobject的集合,通過kset可以遍歷這組kobject,如SCSI子系統中,設備是一種kobject,通過設備集kset,可以遍歷所有的設備。
/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 */
struct kset {
	struct list_head list;		/* list of the kobjects linked into this kset */
	spinlock_t list_lock;		/* spinlock taken while iterating the list */
	struct kobject kobj;		/* embedded kobject: a kset can itself be treated as a kobject */
	const struct kset_uevent_ops *uevent_ops;	/* callbacks used when sending uevents */
};
在發送事件到用戶空間時,可以回調kset_uevent_ops中的3個回調函數:
struct kset_uevent_ops { int (* const filter)(struct kset *kset, struct kobject *kobj); const char *(* const name)(struct kset *kset, struct kobject *kobj); int (* const uevent)(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env);};
filter:在發送事件之前過濾某些事件。
name: 獲取名稱。
uevent:設置uevent需要的環境變量。
/* kobject API: initialisation, registration, lifetime and lookup helpers. */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
int kobject_add(struct kobject *kobj, struct kobject *parent,
		const char *fmt, ...);
int kobject_init_and_add(struct kobject *kobj,
			 struct kobj_type *ktype, struct kobject *parent,
			 const char *fmt, ...);

void kobject_del(struct kobject *kobj);

struct kobject * kobject_create(void);
struct kobject * kobject_create_and_add(const char *name,
					struct kobject *parent);

int kobject_rename(struct kobject *, const char *new_name);
int kobject_move(struct kobject *, struct kobject *);

/* reference counting */
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);

const void *kobject_namespace(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);
初始化流程主要在kobject_init:
/** * kobject_init - initialize a kobject structure * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * * This function will properly initialize a kobject such that it can then * be passed to the kobject_add() call. * * After this function is called, the kobject MUST be cleaned up by a call * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */void kobject_init(struct kobject *kobj, struct kobj_type *ktype){ char *err_str; if (!kobj) { err_str = "invalid kobject pointer!"; goto error; } if (!ktype) { err_str = "must have a ktype to be initialized properly!\n"; goto error; } if (kobj->state_initialized) { //避免重復(fù)初始化 /* do not error out as sometimes we can recover */ printk(KERN_ERR "kobject (%p): tried to init an initialized " "object, something is seriously wrong.\n", kobj); dump_stack(); } kobject_init_internal(kobj); //完成初始化的主要函數(shù) kobj->ktype = ktype; return;error: printk(KERN_ERR "kobject (%p): %s\n", kobj, err_str); dump_stack();}EXPORT_SYMBOL(kobject_init);
由上面函數可以看出由kobject_init_internal完成初始化:
/*
 * kobject_init_internal - put a kobject into its freshly-initialised state.
 * @kobj: object to initialise; NULL is silently ignored.
 *
 * Resets the sysfs/uevent state bits, initialises the reference count and
 * the kset list linkage, and marks the object as initialised.
 */
static void kobject_init_internal(struct kobject *kobj)
{
	if (kobj) {
		kref_init(&kobj->kref);
		INIT_LIST_HEAD(&kobj->entry);

		/* not in sysfs yet and no uevent has been emitted */
		kobj->state_in_sysfs = 0;
		kobj->state_add_uevent_sent = 0;
		kobj->state_remove_uevent_sent = 0;

		kobj->state_initialized = 1;
	}
}
kobject_create函數僅僅是在調用kobject_init之前,先分配kobject空間。在kobject初始化之后,需要調用kobject_add將kobject添加到sysfs中。
/** * kobject_add - the main kobject add function * @kobj: the kobject to add * @parent: pointer to the parent of the kobject. * @fmt: format to name the kobject with. * * The kobject name is set and added to the kobject hierarchy in this * function. * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the * kobject associated with the kset assigned to this kobject. If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. * * If this function returns an error, kobject_put() must be called to * properly clean up the memory associated with the object. * Under no instance should the kobject that is passed to this function * be directly freed with a call to kfree(), that can leak memory. * * Note, no "add" uevent will be created with this call, the caller should set * up all of the necessary sysfs files for the object and then call * kobject_uevent() with the UEVENT_ADD parameter to ensure that * userspace is properly notified of this kobject's creation. */int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...){ va_list args; int retval; if (!kobj) return -EINVAL; if (!kobj->state_initialized) { //add之前需要初始化 printk(KERN_ERR "kobject '%s' (%p): tried to add an " "uninitialized object, something is seriously wrong.\n", kobject_name(kobj), kobj); dump_stack(); return -EINVAL; } va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); //主要完成add操作 va_end(args); return retval;}EXPORT_SYMBOL(kobject_add);
kobject_add_varg/kobject_add_internal主要完成將kobject添加到sysfs的操作:
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj, struct kobject *parent, const char *fmt, va_list vargs){ int retval; //設(shè)置kobject在sysfs中顯示的名稱 retval = kobject_set_name_vargs(kobj, fmt, vargs); if (retval) { printk(KERN_ERR "kobject: can not set name properly!\n"); return retval; } kobj->parent = parent; return kobject_add_internal(kobj); //主要實(shí)現(xiàn)函數(shù)}static int kobject_add_internal(struct kobject *kobj){ int error = 0; struct kobject *parent; if (!kobj) return -ENOENT; if (!kobj->name || !kobj->name[0]) { WARN(1, "kobject: (%p): attempted to be registered with empty " "name!\n", kobj); return -EINVAL; } parent = kobject_get(kobj->parent); //增加父對象的引用計(jì)數(shù) /* join kset if set, use it as parent if we do not already have one */ if (kobj->kset) { //如果設(shè)置了kset,而沒有設(shè)置parent,則把kset的kobject設(shè)置為parent if (!parent) parent = kobject_get(&kobj->kset->kobj); kobj_kset_join(kobj); kobj->parent = parent; } pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n", kobject_name(kobj), kobj, __func__, parent ? kobject_name(parent) : "<NULL>", kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>"); error = create_dir(kobj); //創(chuàng)建sysfs對應(yīng)的目錄和屬性文件 if (error) { //出錯(cuò)回滾 kobj_kset_leave(kobj); kobject_put(parent); kobj->parent = NULL; /* be noisy on error issues */ if (error == -EEXIST) WARN(1, "%s failed for %s with " "-EEXIST, don't try to register things with " "the same name in the same directory.\n", __func__, kobject_name(kobj)); else WARN(1, "%s failed for %s (error: %d parent: %s)\n", __func__, kobject_name(kobj), error, parent ? kobject_name(parent) : "'none'"); } else kobj->state_in_sysfs = 1; //更新標(biāo)志位 return error;}
由create_dir在sysfs創建真實的目錄和文件,這點由下一篇sysfs詳細描述。理解了kobject_init和kobject_add之后,由名字可以知道函數kobject_init_and_add和kobject_create_and_add的作用。
調用kobject_del將kobject從sysfs和kset中移除,最終由kobject_put釋放:
/** * kobject_del - unlink kobject from hierarchy. * @kobj: object. */void kobject_del(struct kobject *kobj){ struct kernfs_node *sd; if (!kobj) return; sd = kobj->sd; sysfs_remove_dir(kobj); //刪除kobject在sysfs中的目錄 sysfs_put(sd); kobj->state_in_sysfs = 0; //設(shè)置標(biāo)志位 kobj_kset_leave(kobj); //kobject脫離kset鏈表 kobject_put(kobj->parent); //調(diào)用kobject_release釋放 kobj->parent = NULL;}EXPORT_SYMBOL(kobject_del);/** * kobject_put - decrement refcount for object. * @kobj: object. * * Decrement the refcount, and if 0, call kobject_cleanup(). */void kobject_put(struct kobject *kobj){ if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not " "initialized, yet kobject_put() is being " "called.\n", kobject_name(kobj), kobj); kref_put(&kobj->kref, kobject_release); //調(diào)用kobject_release }}EXPORT_SYMBOL(kobject_put);static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)){ return kref_sub(kref, 1, release);}static inline int kref_sub(struct kref *kref, unsigned int count, void (*release)(struct kref *kref)){ WARN_ON(release == NULL); if (atomic_sub_and_test((int) count, &kref->refcount)) { release(kref); //調(diào)用kobject_release return 1; } return 0;}
根據上面的代碼追蹤,得知kobject_release才是釋放kobject的主角:
/*
 * kobject_release - kref release callback for a kobject.
 * @kref: the reference counter embedded in the kobject
 *
 * With CONFIG_DEBUG_KOBJECT_RELEASE the cleanup is pushed to delayed work
 * after a randomised delay of one to four seconds' worth of jiffies;
 * otherwise kobject_cleanup() runs right away.
 */
static void kobject_release(struct kref *kref)
{
	struct kobject *obj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
	unsigned long wait = HZ + HZ * (get_random_int() & 0x3);

	pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
		 kobject_name(obj), obj, __func__, obj->parent, wait);

	/* kobject_delayed_cleanup() will perform the cleanup later */
	INIT_DELAYED_WORK(&obj->release, kobject_delayed_cleanup);
	schedule_delayed_work(&obj->release, wait);
#else
	kobject_cleanup(obj);
#endif
}
如果在內核編譯時指定CONFIG_DEBUG_KOBJECT_RELEASE,則使用延遲release方式調用kobject_delayed_cleanup,否則直接調用kobject_cleanup。
#ifdef CONFIG_DEBUG_KOBJECT_RELEASEstatic void kobject_delayed_cleanup(struct work_struct *work){ kobject_cleanup(container_of(to_delayed_work(work), //最終還是調(diào)用 struct kobject, release));}#endif/* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup */static void kobject_cleanup(struct kobject *kobj){ struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; pr_debug("kobject: '%s' (%p): %s, parent %p\n", kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) pr_debug("kobject: '%s' (%p): does not have a release() " "function, it is broken and must be fixed.\n", kobject_name(kobj), kobj); /* send "remove" if the caller did not do it but sent "add" */ if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) { pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n", kobject_name(kobj), kobj); kobject_uevent(kobj, KOBJ_REMOVE); //僅僅發(fā)送一次REMOVE消息 } /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) { pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n", kobject_name(kobj), kobj); kobject_del(kobj); //如果調(diào)用者沒有清理sysfs,則清理 } if (t && t->release) { pr_debug("kobject: '%s' (%p): calling ktype release\n", kobject_name(kobj), kobj); t->release(kobj); //調(diào)用kobj_type的release回調(diào)函數(shù) } /* free name if we allocated it */ if (name) { pr_debug("kobject: '%s': free name\n", name); kfree_const(name); }}
void kset_init(struct kset *kset);struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj);int kset_register(struct kset *kset);void kset_unregister(struct kset *kset);struct kset * kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj);
內核對象集由kset_create創建:
/** * kset_create - create a struct kset dynamically * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically. This structure can * then be registered with the system and show up in sysfs with a call to * kset_register(). When you are finished with this structure, if * kset_register() has been called, call kset_unregister() and the * structure will be dynamically freed when it is no longer being used. * * If the kset was not able to be created, NULL will be returned. */static struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj){ struct kset *kset; int retval; kset = kzalloc(sizeof(*kset), GFP_KERNEL); //分配空間 if (!kset) return NULL; retval = kobject_set_name(&kset->kobj, "%s", name); //設(shè)置kset在sysfs中的名字 if (retval) { kfree(kset); return NULL; } kset->uevent_ops = uevent_ops; //設(shè)置uevent_ops kset->kobj.parent = parent_kobj; //設(shè)置kset的父對象 /* * The kobject of this kset will have a type of kset_ktype and belong to * no kset itself. That way we can properly free it when it is * finished being used. */ kset->kobj.ktype = &kset_ktype; //設(shè)置kobj_type kset->kobj.kset = NULL; return kset;}
內核對象集由kset_init執行初始化:
/** * kset_init - initialize a kset for use * @k: kset */void kset_init(struct kset *k){ kobject_init_internal(&k->kobj); //這里初始化 INIT_LIST_HEAD(&k->list); spin_lock_init(&k->list_lock);}static void kobject_init_internal(struct kobject *kobj){ if (!kobj) return; kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->state_in_sysfs = 0; //設(shè)置對應(yīng)標(biāo)志位 kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; kobj->state_initialized = 1;}
初始化kset之后,調用kset_register,將kset添加到sysfs:
/** * kset_register - initialize and add a kset. * @k: kset. */int kset_register(struct kset *k){ int err; if (!k) return -EINVAL; kset_init(k); err = kobject_add_internal(&k->kobj); //完成register動作,前面已說明 if (err) return err; kobject_uevent(&k->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間 return 0;}EXPORT_SYMBOL(kset_register);
經過kset_create, kset_init和kset_register之后,kset已初始化并添加完成。當然kset_create_and_add包含了這三個函數。
內核對象集的釋放過程與kobject的釋放過程類似,由kset_unregister完成:
/**
 * kset_unregister - remove a kset.
 * @k: kset.
 *
 * Unlinks the embedded kobject from the hierarchy and drops a reference
 * on it.  A NULL @k is ignored.
 */
void kset_unregister(struct kset *k)
{
	if (k) {
		struct kobject *inner = &k->kobj;

		kobject_del(inner);	/* remove the sysfs directory and files */
		kobject_put(inner);	/* same release path as a plain kobject */
	}
}
EXPORT_SYMBOL(kset_unregister);
由前面的代碼可以看到無論kobject或是kset,都會向用戶空間發送事件,由kobject_uevent函數通過設置環境變量的方式完成:
/*
 * struct kobj_uevent_env - environment assembled for one uevent.
 */
struct kobj_uevent_env {
	char *argv[3];			/* command line used for the usermode helper */
	char *envp[UEVENT_NUM_ENVP];	/* array of environment variable pointers */
	int envp_idx;			/* index of the next free envp slot */
	char buf[UEVENT_BUFFER_SIZE];	/* buffer holding the environment strings */
	int buflen;
};

/**
 * kobject_uevent - notify userspace by sending an uevent
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 *
 * Thin wrapper around kobject_uevent_env() with no extra environment.
 *
 * Returns 0 if kobject_uevent() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
	return kobject_uevent_env(kobj, action, NULL);
}
EXPORT_SYMBOL_GPL(kobject_uevent);

/**
 * kobject_uevent_env - send an uevent with environmental data
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 * @envp_ext: pointer to environmental data
 *
 * Returns 0 if kobject_uevent_env() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
		       char *envp_ext[])
{
	struct kobj_uevent_env *env;
	const char *action_string = kobject_actions[action];
	const char *devpath = NULL;
	const char *subsystem;
	struct kobject *top_kobj;
	struct kset *kset;
	const struct kset_uevent_ops *uevent_ops;
	int i = 0;
	int retval = 0;
#ifdef CONFIG_NET
	struct uevent_sock *ue_sk;
#endif

	pr_debug("kobject: '%s' (%p): %s\n",
		 kobject_name(kobj), kobj, __func__);

	/*
	 * search the kset we belong to: walk up the parent chain to the
	 * nearest kobject that has a kset, since the kset carries uevent_ops
	 */
	top_kobj = kobj;
	while (!top_kobj->kset && top_kobj->parent)
		top_kobj = top_kobj->parent;

	if (!top_kobj->kset) {
		pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
			 "without kset!\n", kobject_name(kobj), kobj,
			 __func__);
		return -EINVAL;
	}

	kset = top_kobj->kset;
	/* the callbacks of that kset drive the rest of this function */
	uevent_ops = kset->uevent_ops;

	/* skip the event, if uevent_suppress is set*/
	if (kobj->uevent_suppress) {
		pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
				 "caused the event to drop!\n",
				 kobject_name(kobj), kobj, __func__);
		return 0;
	}
	/* skip the event, if the filter returns zero. */
	if (uevent_ops && uevent_ops->filter)
		if (!uevent_ops->filter(kset, kobj)) {
			pr_debug("kobject: '%s' (%p): %s: filter function "
				 "caused the event to drop!\n",
				 kobject_name(kobj), kobj, __func__);
			return 0;
		}

	/*
	 * originating subsystem: taken from the name() callback when there
	 * is one, otherwise from the name of the kset's own kobject
	 */
	if (uevent_ops && uevent_ops->name)
		subsystem = uevent_ops->name(kset, kobj);
	else
		subsystem = kobject_name(&kset->kobj);
	if (!subsystem) {
		pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
			 "event to drop!\n", kobject_name(kobj), kobj,
			 __func__);
		return 0;
	}

	/* environment buffer */
	env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
	if (!env)
		return -ENOMEM;

	/* complete object path */
	devpath = kobject_get_path(kobj, GFP_KERNEL);
	if (!devpath) {
		retval = -ENOENT;
		goto exit;
	}

	/* default keys: add the standard environment variables */
	retval = add_uevent_var(env, "ACTION=%s", action_string);
	if (retval)
		goto exit;
	retval = add_uevent_var(env, "DEVPATH=%s", devpath);
	if (retval)
		goto exit;
	retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
	if (retval)
		goto exit;

	/* keys passed in from the caller */
	if (envp_ext) {
		for (i = 0; envp_ext[i]; i++) {
			retval = add_uevent_var(env, "%s", envp_ext[i]);
			if (retval)
				goto exit;
		}
	}

	/*
	 * let the kset specific function add its stuff, i.e. the
	 * subsystem-specific environment variables
	 */
	if (uevent_ops && uevent_ops->uevent) {
		retval = uevent_ops->uevent(kset, kobj, env);
		if (retval) {
			pr_debug("kobject: '%s' (%p): %s: uevent() returned "
				 "%d\n", kobject_name(kobj), kobj,
				 __func__, retval);
			goto exit;
		}
	}

	/*
	 * Mark "add" and "remove" events in the object to ensure proper
	 * events to userspace during automatic cleanup. If the object did
	 * send an "add" event, "remove" will be automatically generated by
	 * the core, if not already done by the caller.
	 */
	if (action == KOBJ_ADD)
		kobj->state_add_uevent_sent = 1;
	else if (action == KOBJ_REMOVE)
		kobj->state_remove_uevent_sent = 1;

	mutex_lock(&uevent_sock_mutex);
	/* we will send an event, so request a new sequence number */
	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
	if (retval) {
		mutex_unlock(&uevent_sock_mutex);
		goto exit;
	}

	/* with CONFIG_NET the event is sent over netlink */
#if defined(CONFIG_NET)
	/* send netlink message */
	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
		struct sock *uevent_sock = ue_sk->sk;
		struct sk_buff *skb;
		size_t len;

		if (!netlink_has_listeners(uevent_sock, 1))
			continue;

		/* allocate message with the maximum possible size */
		len = strlen(action_string) + strlen(devpath) + 2;
		skb = alloc_skb(len + env->buflen, GFP_KERNEL);
		if (skb) {
			char *scratch;

			/* add header */
			scratch = skb_put(skb, len);
			sprintf(scratch, "%s@%s", action_string, devpath);

			/* copy keys to our continuous event payload buffer */
			for (i = 0; i < env->envp_idx; i++) {
				len = strlen(env->envp[i]) + 1;
				scratch = skb_put(skb, len);
				strcpy(scratch, env->envp[i]);
			}

			NETLINK_CB(skb).dst_group = 1;
			/* broadcast the message to netlink group 1 */
			retval = netlink_broadcast_filtered(uevent_sock, skb,
							    0, 1, GFP_KERNEL,
							    kobj_bcast_filter,
							    kobj);
			/* ENOBUFS should be handled in userspace */
			if (retval == -ENOBUFS || retval == -ESRCH)
				retval = 0;
		} else
			retval = -ENOMEM;
	}
#endif
	mutex_unlock(&uevent_sock_mutex);

	/*
	 * with CONFIG_UEVENT_HELPER the event is also handed to a
	 * user-space helper program when one is configured
	 */
#ifdef CONFIG_UEVENT_HELPER
	/* call uevent_helper, usually only enabled during early boot */
	if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
		struct subprocess_info *info;

		retval = add_uevent_var(env, "HOME=/");
		if (retval)
			goto exit;
		retval = add_uevent_var(env,
					"PATH=/sbin:/bin:/usr/sbin:/usr/bin");
		if (retval)
			goto exit;
		/* assemble the helper command and its arguments in env->argv */
		retval = init_uevent_argv(env, subsystem);
		if (retval)
			goto exit;

		retval = -ENOMEM;
		/* run the user-space program named by env->argv[0] */
		info = call_usermodehelper_setup(env->argv[0], env->argv,
						 env->envp, GFP_KERNEL,
						 NULL, cleanup_uevent_env, env);
		if (info) {
			retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
			env = NULL;	/* freed by cleanup_uevent_env */
		}
	}
#endif

exit:
	kfree(devpath);
	kfree(env);
	return retval;
}
EXPORT_SYMBOL_GPL(kobject_uevent_env);
本篇文章不是以文件系統的角度來詳細描述sysfs,而是從內核對象如何通過sysfs表示整個設備驅動模型為切入點,進一步理解Linux內核對象。
在上文《內核對象與對象集》中,將kobject添加到sysfs中,kobject_add –> kobject_add_varg –> kobject_add_internal,調用create_dir創建sysfs目錄和屬性文件。
static int create_dir(struct kobject *kobj){ const struct kobj_ns_type_operations *ops; int error; //調(diào)用sysfs接口創(chuàng)建kobject對應(yīng)的目錄 error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); if (error) return error; error = populate_dir(kobj); //在kobject對應(yīng)的目錄中生成默認(rèn)屬性文件 if (error) { sysfs_remove_dir(kobj); return error; } /* * @kobj->sd may be deleted by an ancestor going away. Hold an * extra reference so that it stays until @kobj is gone. */ sysfs_get(kobj->sd); /* * If @kobj has ns_ops, its children need to be filtered based on * their namespace tags. Enable namespace support on @kobj->sd. */ ops = kobj_child_ns_ops(kobj); if (ops) { BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE); BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); sysfs_enable_ns(kobj->sd); } return 0;}/* * populate_dir - populate directory with attributes. * @kobj: object we're working on. * * Most subsystems have a set of default attributes that are associated * with an object that registers with them. This is a helper called during * object registration that loops through the default attributes of the * subsystem and creates attributes files for them in sysfs. */static int populate_dir(struct kobject *kobj){ struct kobj_type *t = get_ktype(kobj); struct attribute *attr; int error = 0; int i; if (t && t->default_attrs) { for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) { error = sysfs_create_file(kobj, attr); //為每個(gè)屬性創(chuàng)建對應(yīng)的文件 if (error) break; } } return error;}
create_dir通過調用sysfs_create_dir_ns創建sysfs中的目錄,調用sysfs_create_file創建屬性文件。
kernfs_node代表sysfs中每個節點。
/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as s_count reference is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * active reference.
 */
struct kernfs_node {
	atomic_t		count;		/* reference count */
	atomic_t		active;		/* active reference count */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	dep_map;
#endif
	/*
	 * Use kernfs_get_parent() and kernfs_name/path() instead of
	 * accessing the following two fields directly.  If the node is
	 * never moved to a different parent, it is safe to access the
	 * parent directly.
	 */
	struct kernfs_node	*parent;	/* parent node */
	const char		*name;		/* node name as shown in sysfs */

	struct rb_node		rb;		/* link into the parent's red-black tree */

	const void		*ns;		/* namespace tag */
	unsigned int		hash;		/* ns + name hash; red-black tree key */
	union {
		struct kernfs_elem_dir		dir;	 /* used when the node is a directory */
		struct kernfs_elem_symlink	symlink; /* used when the node is a symlink */
		struct kernfs_elem_attr		attr;	 /* used when the node is an attribute file */
	};

	void			*priv;

	unsigned short		flags;		/* type (directory, symlink, attribute file) and removal state */
	umode_t			mode;		/* access permissions of the node in sysfs */
	unsigned int		ino;		/* unique number */
	struct kernfs_iattrs	*iattr;		/* non-default inode attributes, NULL if there are none */
};
/** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */int sysfs_create_dir_ns(struct kobject *kobj, const void *ns){ struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) parent = kobj->parent->sd; //如果kobject設(shè)置parent,則使用之 else parent = sysfs_root_kn; //否則parent就設(shè)置為sysfs根目錄 if (!parent) return -ENOENT; //創(chuàng)建目錄 kn = kernfs_create_dir_ns(parent, kobject_name(kobj), S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0;}/** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Returns the created node on success, ERR_PTR() value on failure. */struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns){ struct kernfs_node *kn; int rc; /* allocate 分配空間并初始化, KERNFS_DIR指定創(chuàng)建目錄 */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; //指向根目錄kern_node kn->ns = ns; //指定命名空間 kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); //將kern_node加入父目錄的紅黑樹中 if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc);}
kernfs_create_dir_ns函數中的兩個主要函數kernfs_new_node和kernfs_add_one,在創建文件和創建符號鏈接時同樣使用,僅是參數不同。
為kernfs_node結構分配空間,并初始化:
/*
 * kernfs_new_node - allocate a node and attach it to @parent.
 * @parent: directory the new node will belong to
 * @name: node name
 * @mode: file mode bits
 * @flags: node flags, stored as given
 *
 * Takes a reference on @parent on success.  Returns the new node, or
 * NULL on failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
				    const char *name, umode_t mode,
				    unsigned flags)
{
	struct kernfs_node *kn;

	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
	if (!kn)
		return NULL;

	kernfs_get(parent);
	kn->parent = parent;
	return kn;
}

/*
 * __kernfs_new_node - allocate and initialise a bare kernfs_node.
 * @root: root whose ino_ida supplies the node number
 * @name: node name; a copy is made with kstrdup_const()
 * @mode: file mode bits
 * @flags: node flags, stored as given
 *
 * Returns the new node with a reference count of one, or NULL if any
 * allocation step fails.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
					     const char *name, umode_t mode,
					     unsigned flags)
{
	const char *name_copy;
	struct kernfs_node *kn;
	int ino;

	name_copy = kstrdup_const(name, GFP_KERNEL);
	if (!name_copy)
		return NULL;

	/* the node itself comes from the kernfs_node slab cache */
	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
	if (!kn)
		goto free_name;

	/*
	 * If the ino of the sysfs entry created for a kmem cache gets
	 * allocated from an ida layer, which is accounted to the memcg that
	 * owns the cache, the memcg will get pinned forever. So do not account
	 * ino ida allocations.
	 */
	ino = ida_simple_get(&root->ino_ida, 1, 0,
			     GFP_KERNEL | __GFP_NOACCOUNT);
	if (ino < 0)
		goto free_node;
	kn->ino = ino;

	atomic_set(&kn->count, 1);
	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
	RB_CLEAR_NODE(&kn->rb);

	kn->name = name_copy;
	kn->mode = mode;
	kn->flags = flags;

	return kn;

 free_node:
	kmem_cache_free(kernfs_node_cache, kn);
 free_name:
	kfree_const(name_copy);
	return NULL;
}
將kernfs_node添加到parent的紅黑樹中:
/**
 *	kernfs_add_one - add kernfs_node to parent without warning
 *	@kn: kernfs_node to be added
 *
 *	The caller must already have initialized @kn->parent.  This
 *	function increments nlink of the parent's inode if @kn is a
 *	directory and link into the children list of the parent.
 *
 *	RETURNS:
 *	0 on success, -EEXIST if entry with the given name already
 *	exists.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
	struct kernfs_node *parent = kn->parent;
	struct kernfs_iattrs *ps_iattr;
	bool has_ns;
	int ret;

	mutex_lock(&kernfs_mutex);

	ret = -EINVAL;
	/* the node must carry a namespace tag exactly when the parent has namespaces enabled */
	has_ns = kernfs_ns_enabled(parent);
	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		 has_ns ? "required" : "invalid", parent->name, kn->name))
		goto out_unlock;

	/* the parent has to be a directory */
	if (kernfs_type(parent) != KERNFS_DIR)
		goto out_unlock;

	ret = -ENOENT;
	/* nothing may be added below a parent flagged KERNFS_EMPTY_DIR */
	if (parent->flags & KERNFS_EMPTY_DIR)
		goto out_unlock;

	/* a parent that was activated must still be active */
	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
		goto out_unlock;

	/* the hash is the primary key for the red-black tree comparison */
	kn->hash = kernfs_name_hash(kn->name, kn->ns);

	/* link the node into the parent's red-black tree */
	ret = kernfs_link_sibling(kn);
	if (ret)
		goto out_unlock;

	/* Update timestamps on the parent */
	ps_iattr = parent->iattr;
	if (ps_iattr) {
		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
	}

	mutex_unlock(&kernfs_mutex);

	/*
	 * Activate the new node unless CREATE_DEACTIVATED is requested.
	 * If not activated here, the kernfs user is responsible for
	 * activating the node with kernfs_activate().  A node which hasn't
	 * been activated is not visible to userland and its removal won't
	 * trigger deactivation.
	 */
	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);
	return 0;

out_unlock:
	mutex_unlock(&kernfs_mutex);
	return ret;
}
sysfs紅黑樹中的key:
/**
 *	kernfs_name_hash
 *	@name: Null terminated string to hash
 *	@ns:   Namespace tag to hash
 *
 *	Returns 31 bit hash of ns + name (so it fits in an off_t )
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
	unsigned long acc = init_name_hash();
	unsigned int len = strlen(name);
	unsigned int i;

	/* fold every character of the name into the running hash */
	for (i = 0; i < len; i++)
		acc = partial_name_hash(name[i], acc);

	/* mix in the namespace pointer and keep the low 31 bits */
	acc = (end_name_hash(acc) ^ hash_ptr((void *)ns, 31));
	acc &= 0x7fffffffU;

	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
	if (acc < 2)
		acc += 2;
	if (acc >= INT_MAX)
		acc = INT_MAX - 1;
	return acc;
}

/*
 * kernfs_name_compare - order a (hash, ns, name) key against a node.
 *
 * The hash is compared first, then the namespace pointer, then the name.
 * Returns a negative value, zero or a positive value when the key sorts
 * before, equal to or after @kn.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
			       const void *ns, const struct kernfs_node *kn)
{
	if (hash != kn->hash)
		return hash < kn->hash ? -1 : 1;

	if (ns != kn->ns)
		return ns < kn->ns ? -1 : 1;

	return strcmp(name, kn->name);
}
kernfs_name_hash: 根據name和ns計算kernfs_node的hash值,保存在kernfs_node.hash域中。
kernfs_name_compare: sysfs紅黑樹key的比較函數, 比較優先級是: hash > ns > name
kernfs_node鏈入parent節點紅黑樹中:
/** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * mutex_lock(kernfs_mutex) * * RETURNS: * 0 on susccess -EEXIST on failure. */static int kernfs_link_sibling(struct kernfs_node *kn){ struct rb_node **node = &kn->parent->dir.children.rb_node; //parent目錄的紅黑樹 struct rb_node *parent = NULL; while (*node) { //在parent的目錄中,尋找合適的位置將kn插入parent的紅黑樹中 struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); //優(yōu)先順序: hash > ns > name if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; return 0;}
static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr){ return sysfs_create_file_ns(kobj, attr, NULL);}/** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns){ BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);}EXPORT_SYMBOL_GPL(sysfs_create_file_ns);int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, bool is_bin, umode_t mode, const void *ns){ struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct kernfs_node *kn; loff_t size; if (!is_bin) { struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; /* every kobject with an attribute needs a ktype assigned */ if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) return -EINVAL; //確定讀寫的操作函數(shù) if (sysfs_ops->show && sysfs_ops->store) { if (mode & SYSFS_PREALLOC) ops = &sysfs_prealloc_kfops_rw; else ops = &sysfs_file_kfops_rw; } else if (sysfs_ops->show) { if (mode & SYSFS_PREALLOC) ops = &sysfs_prealloc_kfops_ro; else ops = &sysfs_file_kfops_ro; } else if (sysfs_ops->store) { if (mode & SYSFS_PREALLOC) ops = &sysfs_prealloc_kfops_wo; else ops = &sysfs_file_kfops_wo; } else ops = &sysfs_file_kfops_empty; size = PAGE_SIZE; } else { struct bin_attribute *battr = (void *)attr; if (battr->mmap) ops = &sysfs_bin_kfops_mmap; else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; else if (battr->read) ops = &sysfs_bin_kfops_ro; else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; size = battr->size; }#ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key 
*)&attr->skey;#endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, (void *)attr, ns, key); //創(chuàng)建屬性文件 if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0;}
通過上面的代碼跟蹤,創建屬性文件由__kernfs_create_file實現,最終仍然是調用kernfs_new_node和kernfs_add_one。
/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Allocates a KERNFS_FILE node with kernfs_new_node(), fills it in and
 * links it under @parent with kernfs_add_one().
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
					 const char *name,
					 umode_t mode, loff_t size,
					 const struct kernfs_ops *ops,
					 void *priv, const void *ns,
					 struct lock_class_key *key)
{
	struct kernfs_node *node;
	int err;

	/* allocate and initialise a node of type "file" */
	node = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
			       KERNFS_FILE);
	if (!node)
		return ERR_PTR(-ENOMEM);

	node->attr.ops = ops;
	node->attr.size = size;
	node->ns = ns;
	node->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (key) {
		lockdep_init_map(&node->dep_map, "s_active", key, 0);
		node->flags |= KERNFS_LOCKDEP;
	}
#endif

	/*
	 * node->attr.ops is accessible only while holding active ref.  We
	 * need to know whether some ops are implemented outside active
	 * ref.  Cache their existence in flags.
	 */
	if (ops->seq_show)
		node->flags |= KERNFS_HAS_SEQ_SHOW;
	if (ops->mmap)
		node->flags |= KERNFS_HAS_MMAP;

	/* link the node into the parent's rbtree */
	err = kernfs_add_one(node);
	if (!err)
		return node;

	kernfs_put(node);
	return ERR_PTR(err);
}
在sysfs_add_file_mode_ns函數中,根據sysfs_ops是否實現了show/store以及mode中是否設置了SYSFS_PREALLOC,選擇不同的讀寫操作表(kernfs_ops)。下面以sysfs_prealloc_kfops_rw結構為例,其他結構類似,不贅述。
//常規(guī)文件--sysfs_prealloc_kfops_rwstatic const struct kernfs_ops sysfs_prealloc_kfops_rw = { .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true,};/* kernfs read callback for regular sysfs files with pre-alloc */static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos){ const struct sysfs_ops *ops = sysfs_file_ops(of->kn); //獲取kobject中的sysfs_ops操作表 struct kobject *kobj = of->kn->parent->priv; size_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); //kobject中sd域的sysfs_ops操作表中的show return min(count, len);}/* kernfs write callback for regular sysfs files */static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos){ //獲取kobject中的sysfs_ops操作表 const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; if (!count) return 0; return ops->store(kobj, of->kn->priv, buf, count); //kobject中sd域的sysfs_ops操作表中的store}
關於屬性文件的讀寫操作,最終都回調到kobject的ktype域中的sysfs_ops操作表,這個操作表是在kobject_init函數中(通過ktype參數)設置的。回顧kobject_create函數:
struct kobject *kobject_create(void){ struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); //分配空間 if (!kobj) return NULL; kobject_init(kobj, &dynamic_kobj_ktype); //初始化, kobj_type類型為dynamic_kobj_ktype return kobj;}//注冊如下結(jié)構(gòu)static struct kobj_type dynamic_kobj_ktype = { .release = dynamic_kobj_release, .sysfs_ops = &kobj_sysfs_ops,};const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store,};EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
kobject的sysfs的show和store方法為:kobj_attr_show和kobj_attr_store
/*
 * Generic sysfs ->show(): recover the enclosing kobj_attribute from @attr
 * and call its show() callback.  Returns -EIO when none is set.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
			      char *buf)
{
	struct kobj_attribute *kattr =
		container_of(attr, struct kobj_attribute, attr);

	if (!kattr->show)
		return -EIO;

	/* the subsystem supplied a show() callback: delegate to it */
	return kattr->show(kobj, kattr, buf);
}

/*
 * Generic sysfs ->store(): recover the enclosing kobj_attribute from @attr
 * and call its store() callback.  Returns -EIO when none is set.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
			       const char *buf, size_t count)
{
	struct kobj_attribute *kattr =
		container_of(attr, struct kobj_attribute, attr);

	if (!kattr->store)
		return -EIO;

	/* the subsystem supplied a store() callback: delegate to it */
	return kattr->store(kobj, kattr, buf, count);
}
真正的對屬性文件進(jìn)行讀寫的回調(diào)由業(yè)務(wù)子系統(tǒng)實(shí)現(xiàn)。
/** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name){ return sysfs_do_create_link(kobj, target, name, 1);}EXPORT_SYMBOL_GPL(sysfs_create_link);static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn){ struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn);}static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn){ struct kernfs_node *kn, *target = NULL; BUG_ON(!name || !parent); /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); //創(chuàng)建sysfs符號鏈接 kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn);}
由上面的代碼追蹤,創建符號鏈接最終由kernfs_create_link函數完成。
/** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Returns the created node on success, ERR_PTR() value on error. */struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target){ struct kernfs_node *kn; int error; //指定創(chuàng)建符號鏈接 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); //將kern_node添加到parent的紅黑樹中 if (!error) return kn; kernfs_put(kn); return ERR_PTR(error);}
與創(chuàng)建目錄和文件類似,最終仍然是調(diào)用kernfs_new_node和kernfs_add_one實(shí)現(xiàn)。
目標(biāo):在sysfs中創(chuàng)建一個(gè)目錄/sys/kernel/storage/,在該目錄下,還創(chuàng)建了一個(gè)文件value。value可以寫入整型數(shù)據(jù),隨后可以讀出。
* 定義內(nèi)核對象
/* Example kernel object: a kobject plus the integer it stores. */
struct storage_obj {
	struct kobject kobj;
	int val;	/* holds the value written to the attribute file */
};
定義屬性類型
struct storage_attribute { struct attribute *attr; ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);}
聲明屬性
定義屬性的show和store方法,如下:
//定義并初始化storage_attributestruct storage_attribute *sattr = &struct storage_attribute { .attr = {.name = "value", .mode = 0666}, .show = storage_show, .store = storage_store,};
實(shí)現(xiàn)sysfs操作
ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct storage *stor = container_of(kobj, struct storage_obj, kobj); stor->val = atoi(buf);}ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s) { struct storage *stor = container_of(kobj, struct storage_obj, kobj); memcpy(buf, s, itoa(stor->val));}
定義內(nèi)核對象release方法
release方法設(shè)置在kobj_type結(jié)構(gòu)中
void storage_release(struct kobject *kobj){ ......}
聲明內(nèi)核對象類型
struct storage_ktype { struct kobj_type *ktype;}
封裝對象屬性添加和刪除方法
需要將value屬性添加到內(nèi)核對象,或者從內(nèi)核對象刪除,可以直接調(diào)用sysfs_create_file和sysfs_remove_file。但大多數(shù)情況下,會對這兩個(gè)方法做一層封裝:storage_create_file和storage_remove_file。
/*
 * Add attribute @attr as a file in the sysfs directory of @sobj.
 * Returns 0 on success, -EINVAL if @sobj or @attr is NULL (instead of
 * silently reporting success, or tripping the BUG_ON() on a NULL attribute
 * inside sysfs_create_file_ns()), or the error from sysfs_create_file().
 */
int storage_create_file(struct storage_obj *sobj,
			const struct storage_attribute *attr)
{
	if (!sobj || !attr)
		return -EINVAL;

	return sysfs_create_file(&sobj->kobj, &attr->attr);
}

/*
 * Remove the file of attribute @attr from the sysfs directory of @sobj.
 * Does nothing when either argument is NULL.
 */
void storage_remove_file(struct storage_obj *sobj,
			 const struct storage_attribute *attr)
{
	if (sobj && attr)
		sysfs_remove_file(&sobj->kobj, &attr->attr);
}
定義對象的創(chuàng)建和銷毀方法
struct storage_obj * create_storage_obj() { struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj); struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype); sobj->parent = kernel_kobj; kobject_init_and_add(&sobj->kobj, &stype->ktype); return sobj}void destroy_storage_obj(struct kobject *kobj) { struct storage_obj *sobj = container_of(kobj, struct storage_obj, kobj); kobject_del(kboj); free(sobj); free(stype);}
實(shí)現(xiàn)模塊加載和卸載方法
加載時(shí)調(diào)用create_storage_obj, 卸載時(shí)調(diào)用destroy_storage_obj
Linux的設(shè)備驅(qū)動模型能夠帶來以下的優(yōu)點(diǎn):
* 使用統(tǒng)一機(jī)制來表達(dá)設(shè)備與驅(qū)動之間的關(guān)系,規(guī)范設(shè)備驅(qū)動的編寫,核心代碼復(fù)用。
* 將系統(tǒng)中的設(shè)備以樹結(jié)構(gòu)組織,并且通過sysfs將其呈現(xiàn)在用戶空間——包括所有的總線和內(nèi)部連接。
* 支持設(shè)備的熱拔插機(jī)制。
* 支持通用的電源管理機(jī)制,通過由葉子節(jié)點(diǎn)到根節(jié)點(diǎn)的方向遍歷設(shè)備樹,確保子設(shè)備在父設(shè)備之前斷電。
內(nèi)核基于內(nèi)核對象和sysfs,通過抽象以下五種概念,實(shí)現(xiàn)了設(shè)備驅(qū)動模型的框架,使得編寫子系統(tǒng)成為“八股文”。
1. bus_type: 總線類型,每個(gè)子系統(tǒng)有且只有一個(gè)總線類型,由bus_type和subsys_private兩個(gè)結(jié)構(gòu)共同描述。
2. device: 設(shè)備,描述掛在總線類型中的設(shè)備,由device和device_private兩個(gè)結(jié)構(gòu)共同描述。
3. driver: 驅(qū)動, 描述掛在總線類型中的驅(qū)動模塊,由device_driver和driver_private兩個(gè)結(jié)構(gòu)共同描述。
4. class: 類,每個(gè)總線類型有且只有一個(gè)類,由class和subsys_private兩個(gè)結(jié)構(gòu)共同描述。
5. class_interface: 接口,每個(gè)類有多個(gè)接口,由class_interface結(jié)構(gòu)描述。
在Linux內(nèi)核中,子系統(tǒng)是由bus_type, device, driver, class和class_interface之間的關(guān)系所描述,而設(shè)備驅(qū)動模型正是這些關(guān)系的核心實(shí)現(xiàn),使得在編寫子系統(tǒng)程序時(shí),只要遵循設(shè)備模型的套路,便不需要關(guān)注于這些復(fù)雜的關(guān)系,只需實(shí)現(xiàn)自身的業(yè)務(wù)邏輯。
每個(gè)子系統(tǒng)都有一個(gè)總線類型,總線類型擁有一個(gè)設(shè)備鏈表和一個(gè)驅(qū)動鏈表,用于連接由該總線類型已發(fā)現(xiàn)的設(shè)備和已加載的驅(qū)動,設(shè)備發(fā)現(xiàn)和驅(qū)動加載的順序是任意的。每個(gè)設(shè)備最多綁定到一個(gè)驅(qū)動,被綁定了驅(qū)動的設(shè)備可以正常工作。除此之外,每個(gè)設(shè)備可以唯一屬于某個(gè)類,類中包含多個(gè)接口,接口的方法作用于設(shè)備,不管是先添加接口,還是先發(fā)現(xiàn)設(shè)備。
/*
 * bus_type - the part of a bus type filled in by the subsystem author:
 * a name, attribute groups and a set of callbacks.
 */
struct bus_type {
	const char *name;				/* subsystem name */
	const char *dev_name;				/* used by the subsystem to generate device names */
	struct device *dev_root;
	struct device_attribute *dev_attrs;		/* use dev_groups instead */
	const struct attribute_group **bus_groups;	/* attribute groups of the bus type */
	const struct attribute_group **dev_groups;	/* attribute groups of devices */
	const struct attribute_group **drv_groups;	/* attribute groups of drivers */

	int (*match)(struct device *dev, struct device_driver *drv);	/* check whether device and driver can be bound */
	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);	/* set bus-specific environment variables before an event is sent */
	int (*probe)(struct device *dev);		/* initialise the device and bind it when it can be bound to a driver */
	int (*remove)(struct device *dev);		/* called when the device is unbound from its driver */
	void (*shutdown)(struct device *dev);		/* called when the device is shut down */

	int (*online)(struct device *dev);		/* called when the device is brought online */
	int (*offline)(struct device *dev);		/* called when the device is taken offline */

	int (*suspend)(struct device *dev, pm_message_t state);	/* called when the device enters a power-saving state */
	int (*resume)(struct device *dev);		/* called when the device returns to normal operation */

	const struct dev_pm_ops *pm;			/* power management */

	const struct iommu_ops *iommu_ops;

	struct subsys_private *p;			/* private data of the subsystem */
	struct lock_class_key lock_key;
};

/*
 * subsys_private - the relationships of a bus type or class inside the
 * device model.
 */
struct subsys_private {
	struct kset subsys;			/* kset of the bus; /sys/bus/scsi for the scsi subsystem */
	struct kset *devices_kset;		/* devices kset; /sys/bus/scsi/devices */
	struct list_head interfaces;		/* list of interfaces */
	struct mutex mutex;

	struct kset *drivers_kset;		/* drivers kset; /sys/bus/scsi/drivers */
	struct klist klist_devices;		/* devices on the bus */
	struct klist klist_drivers;		/* drivers on the bus */
	struct blocking_notifier_head bus_notifier;	/* notified when the subsystem changes */
	unsigned int drivers_autoprobe:1;	/* probe automatically when a device or driver is added */
	struct bus_type *bus;			/* back-pointer to the bus type */

	struct kset glue_dirs;
	struct class *class;			/* back-pointer to the class */
};
從上面的兩個(gè)結(jié)構(gòu)可以看到,bus_type包含的主要是實(shí)現(xiàn)子系統(tǒng)應(yīng)該具體關(guān)注的比如name,一組回調(diào)函數(shù)。而subsys_private結(jié)構(gòu)主要是設(shè)備驅(qū)動模型中的關(guān)系的表達(dá),如字段subsys的類型是kset,描述該子系統(tǒng)在sysfs中的表達(dá);klist_devices和klist_drivers分別是設(shè)備鏈表和驅(qū)動鏈表,用于管理總線類型的所有設(shè)備和驅(qū)動。之后仍然會遇到xxx_private的結(jié)構(gòu),以這種方式命名的結(jié)構(gòu),都是給設(shè)備驅(qū)動模型核心使用的,業(yè)務(wù)子系統(tǒng)無需也不能使用。
實(shí)現(xiàn)子系統(tǒng)的第一步就是創(chuàng)建bus_type,并將其注冊到系統(tǒng),此時(shí)需要調(diào)用bus_register:
/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Once we have that, we register the bus with the kobject
 * infrastructure, then register the children subsystems it has:
 * the devices and drivers that belong to the subsystem.
 *
 * Returns 0 on success or a negative errno.  On failure every step
 * already completed is undone in reverse order and @bus->p is reset to
 * NULL.
 */
int bus_register(struct bus_type *bus)
{
	int retval;
	struct subsys_private *priv;
	struct lock_class_key *key = &bus->lock_key;

	/* allocate the private data of the bus type */
	priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	/* link bus_type and subsys_private to each other */
	priv->bus = bus;
	bus->p = priv;

	BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);

	/* the bus name becomes the kobject name, i.e. the name shown in sysfs */
	retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
	if (retval)
		goto out;

	priv->subsys.kobj.kset = bus_kset;
	priv->subsys.kobj.ktype = &bus_ktype;
	priv->drivers_autoprobe = 1;	/* enable automatic probing */

	/* add the bus type to the device model */
	retval = kset_register(&priv->subsys);
	if (retval)
		goto out;

	/* create the "uevent" attribute file */
	retval = bus_create_file(bus, &bus_attr_uevent);
	if (retval)
		goto bus_uevent_fail;

	/* create the "devices" directory */
	priv->devices_kset = kset_create_and_add("devices", NULL,
						 &priv->subsys.kobj);
	if (!priv->devices_kset) {
		retval = -ENOMEM;
		goto bus_devices_fail;
	}

	/* create the "drivers" directory */
	priv->drivers_kset = kset_create_and_add("drivers", NULL,
						 &priv->subsys.kobj);
	if (!priv->drivers_kset) {
		retval = -ENOMEM;
		goto bus_drivers_fail;
	}

	/* initialise the lists and the lock */
	INIT_LIST_HEAD(&priv->interfaces);
	__mutex_init(&priv->mutex, "subsys mutex", key);
	klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
	klist_init(&priv->klist_drivers, NULL, NULL);

	/* add the probe files drivers_autoprobe and drivers_probe to sysfs */
	retval = add_probe_files(bus);
	if (retval)
		goto bus_probe_files_fail;

	/* add the attribute files of the bus type */
	retval = bus_add_groups(bus, bus->bus_groups);
	if (retval)
		goto bus_groups_fail;

	pr_debug("bus: '%s': registered\n", bus->name);
	return 0;

	/* failure: roll back in reverse order of setup */
bus_groups_fail:
	remove_probe_files(bus);
bus_probe_files_fail:
	kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
	kset_unregister(bus->p->devices_kset);
bus_devices_fail:
	bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
	kset_unregister(&bus->p->subsys);
out:
	kfree(bus->p);
	bus->p = NULL;
	return retval;
}
EXPORT_SYMBOL_GPL(bus_register);
注冊總線類型后,便可以在系統(tǒng)看到:
root@ubuntu16:~# ls /sys/bus/scsi -ltotal 0drwxr-xr-x 2 root root 0 Sep 5 16:01 devicesdrwxr-xr-x 4 root root 0 Sep 2 09:44 drivers-rw-r--r-- 1 root root 4096 Sep 5 11:29 drivers_autoprobe--w------- 1 root root 4096 Sep 5 11:29 drivers_probe--w------- 1 root root 4096 Sep 2 09:44 ueventroot@ubuntu16:~#
當(dāng)從系統(tǒng)中注銷子系統(tǒng)時(shí),需要調(diào)用bus_unregister,完成總線類型的反注冊:
/**
 * bus_unregister - remove a bus from the system
 * @bus: bus.
 *
 * Unregister the child subsystems and the bus itself.
 * Finally, we call bus_put() to release the refcount
 *
 * NOTE(review): no bus_put() call is visible in the body below; the
 * sentence above may be stale -- confirm against the kernel version used.
 */
void bus_unregister(struct bus_type *bus)
{
	pr_debug("bus: '%s': unregistering\n", bus->name);
	if (bus->dev_root)
		device_unregister(bus->dev_root);	/* remove the root device */
	bus_remove_groups(bus, bus->bus_groups);	/* remove the bus attribute files */
	remove_probe_files(bus);			/* remove drivers_autoprobe and drivers_probe */
	kset_unregister(bus->p->drivers_kset);		/* remove the "drivers" directory */
	kset_unregister(bus->p->devices_kset);		/* remove the "devices" directory */
	bus_remove_file(bus, &bus_attr_uevent);		/* remove the "uevent" file */
	kset_unregister(&bus->p->subsys);		/* remove the bus directory */
}
EXPORT_SYMBOL_GPL(bus_unregister);
/* device - a device attached to a bus type. */
struct device {
	struct device *parent;			/* parent device, e.g. the HBA */

	struct device_private *p;		/* private data of the device */

	struct kobject kobj;			/* embedded kobject */
	const char *init_name;			/* initial name of the device */
	const struct device_type *type;		/* device type: fields and methods shared by devices of that type */

	struct mutex mutex;			/* mutex to synchronize calls to its driver */

	struct bus_type *bus;			/* type of bus device is on; the bus the device belongs to */
	struct device_driver *driver;		/* which driver has allocated this device */
	void *platform_data;			/* Platform specific data, device core doesn't touch it */
	void *driver_data;			/* Driver data, set and get with dev_set/get_drvdata */
	struct dev_pm_info power;
	struct dev_pm_domain *pm_domain;

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
	struct irq_domain *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
	struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
	struct list_head msi_list;
#endif

#ifdef CONFIG_NUMA
	int numa_node;				/* NUMA node this device is close to */
#endif
	u64 *dma_mask;				/* dma mask (if dma'able device) */
	u64 coherent_dma_mask;/* Like dma_mask, but for
					     alloc_coherent mappings as
					     not all hardware supports
					     64 bit addresses for consistent
					     allocations such descriptors. */
	unsigned long dma_pfn_offset;

	struct device_dma_parameters *dma_parms;

	struct list_head dma_pools;		/* dma pools (if dma'ble) */

	struct dma_coherent_mem *dma_mem;	/* internal for coherent mem override */
#ifdef CONFIG_DMA_CMA
	struct cma *cma_area;			/* contiguous memory area for dma allocations */
#endif
	/* arch specific additions */
	struct dev_archdata archdata;

	struct device_node *of_node;		/* associated device tree node */
	struct fwnode_handle *fwnode;		/* firmware device node */

	dev_t devt;				/* dev_t, creates the sysfs "dev"; the device number */
	u32 id;					/* device instance */

	spinlock_t devres_lock;
	struct list_head devres_head;		/* head of the device resource list */

	struct klist_node knode_class;		/* node in the class's device list */
	struct class *class;			/* class the device is linked into */
	const struct attribute_group **groups;	/* optional groups; attributes specific to this device */

	void (*release)(struct device *dev);	/* called when the device is released */
	struct iommu_group *iommu_group;

	bool offline_disabled:1;
	bool offline:1;
};

/* device_private - device-model bookkeeping for a device. */
struct device_private {
	struct klist klist_children;		/* list of child devices */
	struct klist_node knode_parent;		/* node in the parent's children list */
	struct klist_node knode_driver;		/* node in the driver's device list */
	struct klist_node knode_bus;		/* node in the bus's device list */
	struct list_head deferred_probe;	/* entry in the deferred-probe list */
	struct device *device;			/* back-pointer to the device */
};

/* device_type - attributes and methods common to devices of one type. */
struct device_type {
	const char *name;			/* name of the device type */
	const struct attribute_group **groups;	/* attribute groups common to the devices */
	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);	/* called before an event is sent, to set its environment variables */
	char *(*devnode)(struct device *dev, umode_t *mode,
			 kuid_t *uid, kgid_t *gid);	/* provides a naming hint when the device node is created */
	void (*release)(struct device *dev);	/* called when the device is released */

	const struct dev_pm_ops *pm;
};
在設備驅動模型中,device結構有bus域,指向device所屬的總線類型;class域指向device所屬的唯一的類;driver域指向設備所綁定的驅動。與內核對象一樣,設備也被組織成層次結構,通過parent指向父設備。
device_private結(jié)構(gòu)由設(shè)備驅(qū)動模型處理,維護(hù)和其他結(jié)構(gòu)之間的內(nèi)部關(guān)系。device_type結(jié)構(gòu)定義設(shè)備公有的屬性和方法。
當(dāng)設(shè)備被發(fā)現(xiàn)后,需要將設(shè)備注冊到系統(tǒng),需要調(diào)用device_register函數(shù):
/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * This happens in two clean steps - initialize the device
 * and add it to the system. The two steps can be called
 * separately, but this is the easiest and most common.
 * I.e. you should only call the two helpers separately if
 * have a clearly defined need to use and refcount the device
 * before it is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 */
int device_register(struct device *dev)
{
	device_initialize(dev);		/* initialise the device structure */
	return device_add(dev);		/* add the device to the system */
}
EXPORT_SYMBOL_GPL(device_register);

/*
 * device_initialize - prepare a device structure before it is added.
 * Sets the kset and ktype of the embedded kobject and initialises the
 * lists, locks and power-management state of @dev.
 */
void device_initialize(struct device *dev)
{
	dev->kobj.kset = devices_kset;			/* the /sys/devices/ kset */
	kobject_init(&dev->kobj, &device_ktype);	/* the kobject type of a device is device_ktype */
	INIT_LIST_HEAD(&dev->dma_pools);
	mutex_init(&dev->mutex);
	lockdep_set_novalidate_class(&dev->mutex);
	spin_lock_init(&dev->devres_lock);
	INIT_LIST_HEAD(&dev->devres_head);
	device_pm_init(dev);
	set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
	INIT_LIST_HEAD(&dev->msi_list);
#endif
}
EXPORT_SYMBOL_GPL(device_initialize);
device_register函數(shù)調(diào)用device_initialize對device結(jié)構(gòu)進(jìn)行初始化,調(diào)用device_add函數(shù)完成設(shè)備添加到系統(tǒng)。
/*
 * device_add - add an initialised device to the device hierarchy.
 * @dev: device to add
 *
 * Names the device, adds its kobject under the parent, creates its sysfs
 * files and links, adds it to its bus, sends the ADD uevent, tries to bind
 * a driver and finally links it into its parent's and class's lists.
 *
 * A reference on @dev is taken on entry and dropped at "done".  Returns
 * the value of the last step; on error the steps already done are undone
 * in reverse order through the label ladder at the end.
 */
int device_add(struct device *dev)
{
	struct device *parent = NULL;
	struct kobject *kobj;
	struct class_interface *class_intf;
	int error = -EINVAL;

	dev = get_device(dev);
	if (!dev)
		goto done;

	/* allocate and initialise device_private here if not yet set */
	if (!dev->p) {
		error = device_private_init(dev);
		if (error)
			goto done;
	}

	/*
	 * for statically allocated devices, which should all be converted
	 * some day, we need to initialize the name. We prevent reading back
	 * the name, and force the use of dev_name()
	 */
	if (dev->init_name) {
		dev_set_name(dev, "%s", dev->init_name);	/* set the name of the device's kobject */
		dev->init_name = NULL;
	}

	/* subsystems can specify simple device enumeration */
	/* still unnamed: build the name from the bus's dev_name and the device id */
	if (!dev_name(dev) && dev->bus && dev->bus->dev_name)
		dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

	if (!dev_name(dev)) {
		error = -EINVAL;
		goto name_error;
	}

	pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

	parent = get_device(dev->parent);
	kobj = get_device_parent(dev, parent);
	if (kobj)
		dev->kobj.parent = kobj;	/* set the parent of the device's kobject */

	/* use parent numa_node */
	if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
		set_dev_node(dev, dev_to_node(parent));

	/* first, register with generic layer. */
	/* we require the name to be set before, and pass NULL */
	/* adds the device to the parent's directory */
	error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
	if (error)
		goto Error;

	/* notify platform of device entry */
	if (platform_notify)
		platform_notify(dev);

	/* create the "uevent" file in the device directory */
	error = device_create_file(dev, &dev_attr_uevent);
	if (error)
		goto attrError;

	/* create the class-related symlinks of the device */
	error = device_add_class_symlinks(dev);
	if (error)
		goto SymlinkError;
	/* create files for the default attributes of the device */
	error = device_add_attrs(dev);
	if (error)
		goto AttrsError;
	/* add the device to its bus_type */
	error = bus_add_device(dev);
	if (error)
		goto BusError;
	error = dpm_sysfs_add(dev);
	if (error)
		goto DPMError;
	device_pm_add(dev);

	if (MAJOR(dev->devt)) {
		/* create the "dev" attribute file, which holds the device number */
		error = device_create_file(dev, &dev_attr_dev);
		if (error)
			goto DevAttrError;

		/* create a symlink to the device directory under /sys/block or /sys/char */
		error = device_create_sys_dev_entry(dev);
		if (error)
			goto SysEntryError;

		/* create the device file under /dev */
		devtmpfs_create_node(dev);
	}

	/* Notify clients of device addition.  This call must come
	 * after dpm_sysfs_add() and before kobject_uevent().
	 */
	if (dev->bus)
		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
					     BUS_NOTIFY_ADD_DEVICE, dev);

	kobject_uevent(&dev->kobj, KOBJ_ADD);	/* send the ADD event for the device */
	bus_probe_device(dev);			/* try to bind the device to a device_driver */
	/* a parent was given: add the device to the parent's children list */
	if (parent)
		klist_add_tail(&dev->p->knode_parent,
			       &parent->p->klist_children);

	/* a class was set: add the device to the class's device list */
	if (dev->class) {
		mutex_lock(&dev->class->p->mutex);
		/* tie the class to the device */
		klist_add_tail(&dev->knode_class,
			       &dev->class->p->klist_devices);

		/* notify any interfaces that the device is here */
		/* i.e. call add_dev of every class_interface of the class */
		list_for_each_entry(class_intf,
				    &dev->class->p->interfaces, node)
			if (class_intf->add_dev)
				class_intf->add_dev(dev, class_intf);
		mutex_unlock(&dev->class->p->mutex);
	}
done:
	put_device(dev);
	return error;
 SysEntryError:
	if (MAJOR(dev->devt))
		device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
	device_pm_remove(dev);
	dpm_sysfs_remove(dev);
 DPMError:
	bus_remove_device(dev);
 BusError:
	device_remove_attrs(dev);
 AttrsError:
	device_remove_class_symlinks(dev);
 SymlinkError:
	device_remove_file(dev, &dev_attr_uevent);
 attrError:
	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
	kobject_del(&dev->kobj);
 Error:
	cleanup_device_parent(dev);
	put_device(parent);
name_error:
	kfree(dev->p);
	dev->p = NULL;
	goto done;
}
EXPORT_SYMBOL_GPL(device_add);
設(shè)備添加到系統(tǒng)主要流程都在device_add函數(shù)實(shí)現(xiàn),上面代碼的注釋基本把主要函數(shù)的作用進(jìn)行了描述。值得關(guān)注的一個(gè)函數(shù)便是bus_probe_device,該函數(shù)完成將設(shè)備綁定到驅(qū)動的動作。
/*
 * bus_probe_device - probe drivers for a newly added device.
 * Does nothing for a device without a bus.  If the bus has automatic
 * probing enabled, tries to attach a driver; then, under the bus mutex,
 * calls add_dev of every subsys_interface registered on the bus.
 */
void bus_probe_device(struct device *dev)
{
	struct bus_type *bus = dev->bus;
	struct subsys_interface *sif;

	if (!bus)
		return;

	if (bus->p->drivers_autoprobe)		/* the bus allows automatic probing */
		device_initial_probe(dev);	/* the main work happens here */

	mutex_lock(&bus->p->mutex);
	list_for_each_entry(sif, &bus->p->interfaces, node)	/* hand the device to every interface */
		if (sif->add_dev)
			sif->add_dev(dev, sif);
	mutex_unlock(&bus->p->mutex);
}

/* First attach attempt for a device; asynchronous probing is allowed. */
void device_initial_probe(struct device *dev)
{
	__device_attach(dev, true);
}

/*
 * __device_attach - try to attach a driver to @dev, under the device lock.
 *
 * If @dev->driver is already set, only the binding is completed.
 * Otherwise every driver on the bus is tried via __device_attach_driver().
 * Returns 1 if the device was already attached or the pre-set driver was
 * bound, 0 if that binding failed; in the search case, the result of
 * bus_for_each_drv().
 */
static int __device_attach(struct device *dev, bool allow_async)
{
	int ret = 0;

	device_lock(dev);
	if (dev->driver) {	/* the driver to bind has been specified */
		/* knode_driver already on a list: nothing left to do */
		if (klist_node_attached(&dev->p->knode_driver)) {
			ret = 1;
			goto out_unlock;
		}
		ret = device_bind_driver(dev);	/* bind and update the lists */
		if (ret == 0)
			ret = 1;
		else {
			dev->driver = NULL;
			ret = 0;
		}
	} else {		/* no driver specified for the device */
		struct device_attach_data data = {
			.dev = dev,
			.check_async = allow_async,
			.want_async = false,
		};

		if (dev->parent)
			pm_runtime_get_sync(dev->parent);

		/* walk all drivers of the bus and try to attach each */
		ret = bus_for_each_drv(dev->bus, NULL, &data,
					__device_attach_driver);
		if (!ret && allow_async && data.have_async) {
			/*
			 * If we could not find appropriate driver
			 * synchronously and we are allowed to do
			 * async probes and there are drivers that
			 * want to probe asynchronously, we'll
			 * try them.
			 */
			dev_dbg(dev, "scheduling asynchronous probe\n");
			get_device(dev);
			async_schedule(__device_attach_async_helper, dev);
		} else {
			pm_request_idle(dev);
		}

		if (dev->parent)
			pm_runtime_put(dev->parent);
	}
out_unlock:
	device_unlock(dev);
	return ret;
}
通過上面3個(gè)函數(shù)的追蹤,__device_attach函數(shù)遍歷bus所有的驅(qū)動,嘗試執(zhí)行attach,具體調(diào)用__device_attach_driver函數(shù)。
static int __device_attach_driver(struct device_driver *drv, void *_data){ struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; /* * Check if device has already been claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->driver) return -EBUSY; if (!driver_match_device(drv, dev)) //調(diào)用bus的match函數(shù),測試是否匹配 return 0; //進(jìn)一步probe設(shè)備,需要設(shè)備已經(jīng)注冊 async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; //如果允許異步探測,則先返回 if (data->check_async && async_allowed != data->want_async) return 0; return driver_probe_device(drv, dev);}int driver_probe_device(struct device_driver *drv, struct device *dev){ int ret = 0; if (!device_is_registered(dev)) //檢查device是否register return -ENODEV; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); ret = really_probe(dev, drv); //真正執(zhí)行探測 pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); return ret;}
從上面兩個(gè)函數(shù)來看,真正執(zhí)行probe的函數(shù)是really_probe。
/*
 * really_probe - bind @drv to @dev by running the probe callback.
 *
 * Returns 1 on success.  Returns 0 when any intermediate step failed; by
 * then everything done so far has been rolled back.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
	int ret = 0;
	int local_trigger_count = atomic_read(&deferred_trigger_count);

	atomic_inc(&probe_count);
	pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
		 drv->bus->name, __func__, drv->name, dev_name(dev));
	WARN_ON(!list_empty(&dev->devres_head));

	dev->driver = drv;	/* point the device at the driver being probed */

	/* If using pinctrl, bind pins now before probing */
	ret = pinctrl_bind_pins(dev);
	if (ret)
		goto probe_failed;

	/*
	 * create the symlink to the device in the driver's sysfs directory
	 * and the symlink to the driver in the device's directory
	 */
	if (driver_sysfs_add(dev)) {
		printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
			__func__, dev_name(dev));
		goto probe_failed;
	}

	if (dev->pm_domain && dev->pm_domain->activate) {
		ret = dev->pm_domain->activate(dev);
		if (ret)
			goto probe_failed;
	}

	/*
	 * Ensure devices are listed in devices_kset in correct order
	 * It's important to move Dev to the end of devices_kset before
	 * calling .probe, because it could be recursive and parent Dev
	 * should always go first
	 */
	devices_kset_move_last(dev);

	if (dev->bus->probe) {
		ret = dev->bus->probe(dev);	/* the bus_type's probe takes precedence */
		if (ret)
			goto probe_failed;
	} else if (drv->probe) {
		ret = drv->probe(dev);		/* otherwise use the driver's probe */
		if (ret)
			goto probe_failed;
	}

	pinctrl_init_done(dev);

	if (dev->pm_domain && dev->pm_domain->sync)
		dev->pm_domain->sync(dev);

	driver_bound(dev);	/* link the device into the driver's device list */
	ret = 1;
	pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
		 drv->bus->name, __func__, dev_name(dev), drv->name);
	goto done;

probe_failed:
	/* probe failed: roll back */
	devres_release_all(dev);
	driver_sysfs_remove(dev);
	dev->driver = NULL;
	dev_set_drvdata(dev, NULL);
	if (dev->pm_domain && dev->pm_domain->dismiss)
		dev->pm_domain->dismiss(dev);

	switch (ret) {
	case -EPROBE_DEFER:
		/* Driver requested deferred probing */
		dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
		driver_deferred_probe_add(dev);
		/* Did a trigger occur while probing? Need to re-trigger if yes */
		if (local_trigger_count != atomic_read(&deferred_trigger_count))
			driver_deferred_probe_trigger();
		break;
	case -ENODEV:
	case -ENXIO:
		pr_debug("%s: probe of %s rejects match %d\n",
			 drv->name, dev_name(dev), ret);
		break;
	default:
		/* driver matched but the probe failed */
		printk(KERN_WARNING
		       "%s: probe of %s failed with error %d\n",
		       drv->name, dev_name(dev), ret);
	}
	/*
	 * Ignore errors returned by ->probe so that the next driver can try
	 * its luck.
	 */
	ret = 0;
done:
	atomic_dec(&probe_count);
	wake_up(&probe_waitqueue);
	return ret;
}
到此,設(shè)備添加到系統(tǒng)的主要流程便基本清楚,不再往下跟蹤。
struct device_driver { const char *name; //driver名稱 struct bus_type *bus; //driver所屬的bus_type struct module *owner; const char *mod_name; /* used for built-in modules */ bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ enum probe_type probe_type; const struct of_device_id *of_match_table; const struct acpi_device_id *acpi_match_table; int (*probe) (struct device *dev); //在device綁定到driver之前,對device進(jìn)行初始化 int (*remove) (struct device *dev); //在device解綁到driver時(shí),回調(diào) void (*shutdown) (struct device *dev); int (*suspend) (struct device *dev, pm_message_t state); int (*resume) (struct device *dev); const struct attribute_group **groups; //driver的屬性 const struct dev_pm_ops *pm; //電源相關(guān) struct driver_private *p; //driver私有結(jié)構(gòu)};struct driver_private { struct kobject kobj; struct klist klist_devices; //driver所支持的device鏈表 struct klist_node knode_bus; //鏈入bus_type的驅(qū)動鏈表中 struct module_kobject *mkobj; struct device_driver *driver; //指向driver};
device_driver結(jié)構(gòu)中,bus域指向驅(qū)動所屬的總線類型,knode_bus域用于鏈入總線類型的驅(qū)動鏈表。driver_private結(jié)構(gòu)中的klist_devices域用于鏈接所有綁定到本驅(qū)動的設(shè)備。
驅(qū)動在加載時(shí),需要將其注冊到總線類型,調(diào)用driver_register實(shí)現(xiàn):
int driver_register(struct device_driver *drv){ int ret; struct device_driver *other; BUG_ON(!drv->bus->p); //確保bus已經(jīng)注冊到驅(qū)動模型中 //如果bus_type和driver都實(shí)現(xiàn)了同一個(gè)回調(diào),優(yōu)先使用bus_type的回調(diào)函數(shù),打印告警信息 if ((drv->bus->probe && drv->probe) || (drv->bus->remove && drv->remove) || (drv->bus->shutdown && drv->shutdown)) printk(KERN_WARNING "Driver '%s' needs updating - please use " "bus_type methods\n", drv->name); other = driver_find(drv->name, drv->bus); //根據(jù)名字查找驅(qū)動 if (other) { printk(KERN_ERR "Error: Driver '%s' is already registered, " "aborting...\n", drv->name); return -EBUSY; } ret = bus_add_driver(drv); //將driver添加到bus if (ret) return ret; ret = driver_add_groups(drv, drv->groups); //創(chuàng)建driver屬性文件 if (ret) { bus_remove_driver(drv); return ret; } kobject_uevent(&drv->p->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間 return ret;}EXPORT_SYMBOL_GPL(driver_register);
添加driver到bus_type,由bus_add_driver實(shí)現(xiàn):
int bus_add_driver(struct device_driver *drv){ struct bus_type *bus; struct driver_private *priv; int error = 0; bus = bus_get(drv->bus); if (!bus) return -EINVAL; pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); //分配driver_private結(jié)構(gòu)空間 if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); //初始化driver設(shè)備鏈表 priv->driver = drv; //關(guān)聯(lián)device_driver和driver_private drv->p = priv; priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, //添加driver到sysfs "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驅(qū)動鏈表中 if (drv->bus->p->drivers_autoprobe) { //自動探測 if (driver_allows_async_probing(drv)) { //允許異步執(zhí)行probe pr_debug("bus: '%s': probing driver %s asynchronously\n", drv->bus->name, drv->name); async_schedule(driver_attach_async, drv); //異步probe } else { error = driver_attach(drv); //同步probe if (error) goto out_unregister; } } module_add_driver(drv->owner, drv); //驅(qū)動實(shí)現(xiàn)的模塊 error = driver_create_file(drv, &driver_attr_uevent); //在driver中添加uevent屬性文件 if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, bus->drv_groups); //添加driver的屬性文件 if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_create_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); //在driver目錄添加的bind和unbind兩個(gè)屬性文件 if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0;out_unregister: kobject_put(&priv->kobj); kfree(drv->p); drv->p = NULL;out_put_bus: bus_put(bus); return error;}
bus_add_driver函數(shù)完成驅(qū)動添加到總線類型,當(dāng)驅(qū)動添加完成后,如果總線類型設(shè)置了允許自動探測標(biāo)志drivers_autoprobe,便可以根據(jù)是否允許異步探測調(diào)用driver_attach_async或driver_attach,driver_attach_async也是調(diào)用driver_attach:
int driver_attach(struct device_driver *drv){ return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);}EXPORT_SYMBOL_GPL(driver_attach);static int __driver_attach(struct device *dev, void *data){ struct device_driver *drv = data; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ if (!driver_match_device(drv, dev)) //調(diào)用bus_type.match return 0; if (dev->parent) /* Needed for USB */ device_lock(dev->parent); device_lock(dev); if (!dev->driver) driver_probe_device(drv, dev); //完成probe的主要函數(shù) device_unlock(dev); if (dev->parent) device_unlock(dev->parent); return 0;}int driver_probe_device(struct device_driver *drv, struct device *dev){ int ret = 0; if (!device_is_registered(dev)) //檢查device是否register return -ENODEV; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); ret = really_probe(dev, drv); //真正執(zhí)行探測 pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); return ret;}
根據(jù)上面3個(gè)函數(shù),最終仍然是調(diào)用前面描述過的really_probe函數(shù)完成最后的探測。
到這里驅(qū)動注冊完成,結(jié)合之前的設(shè)備注冊流程,無論是驅(qū)動注冊或是設(shè)備注冊,只要總線類型設(shè)置了自動探測標(biāo)志位,這兩個(gè)流程都會執(zhí)行探測。所以設(shè)備發(fā)現(xiàn)與驅(qū)動的加載順序已經(jīng)不再重要,也是通過這種雙向探測方式,Linux內(nèi)核支持設(shè)備的熱拔插機(jī)制。
驅(qū)動卸載時(shí),需要調(diào)用driver_unregister函數(shù),使driver脫離總線類型:
void driver_unregister(struct device_driver *drv){ if (!drv || !drv->p) { WARN(1, "Unexpected driver unregister!\n"); return; } driver_remove_groups(drv, drv->groups); //刪除驅(qū)動的屬性文件 bus_remove_driver(drv); //從總線類型中移除驅(qū)動}EXPORT_SYMBOL_GPL(driver_unregister);void bus_remove_driver(struct device_driver *drv){ if (!drv->bus) return; if (!drv->suppress_bind_attrs) remove_bind_files(drv); //刪除驅(qū)動目錄下bind和unbind文件 driver_remove_groups(drv, drv->bus->drv_groups); //刪除總線類型的驅(qū)動屬性文件 driver_remove_file(drv, &driver_attr_uevent); //刪除驅(qū)動目錄下uevent文件 klist_remove(&drv->p->knode_bus); //從總線類型的驅(qū)動鏈表中移除驅(qū)動 pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name); driver_detach(drv); //驅(qū)動與所有綁定的設(shè)備進(jìn)行解綁 module_remove_driver(drv); //驅(qū)動實(shí)現(xiàn)的模塊 kobject_put(&drv->p->kobj); //減少引用計(jì)數(shù) bus_put(drv->bus);}
struct class { const char *name; //類名稱 struct module *owner; //指向?qū)崿F(xiàn)這個(gè)類的模塊的指針 struct class_attribute *class_attrs; //類公共屬性 const struct attribute_group **dev_groups; //歸屬與該類的設(shè)備的默認(rèn)屬性 struct kobject *dev_kobj; //類鏈入sysfs的kobject int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前,設(shè)置類的特定環(huán)境變量 char *(*devnode)(struct device *dev, umode_t *mode); //創(chuàng)建設(shè)備時(shí),返回設(shè)備名稱 void (*class_release)(struct class *class); //類釋放時(shí)回調(diào) void (*dev_release)(struct device *dev); //設(shè)備釋放時(shí)回調(diào) int (*suspend)(struct device *dev, pm_message_t state); //設(shè)備進(jìn)入睡眠狀態(tài)時(shí),回調(diào) int (*resume)(struct device *dev); //設(shè)備被喚醒時(shí),回調(diào) const struct kobj_ns_type_operations *ns_type; //sysfs支持命名空間 const void *(*namespace)(struct device *dev); //返回設(shè)備所在的命名空間 const struct dev_pm_ops *pm; //電源相關(guān) struct subsys_private *p; //類所屬的子系統(tǒng)私有數(shù)據(jù)結(jié)構(gòu)};
類的私有數(shù)據(jù)類型與總線類型的私有數(shù)據(jù)類型都是subsys_private,這里將不再重復(fù)描述。
子系統需要使用類時,需要調用class_register函數向設備驅動模型注冊類:
#define class_register(class) ({ static struct lock_class_key __key; __class_register(class, &__key); })int __class_register(struct class *cls, struct lock_class_key *key){ struct subsys_private *cp; int error; pr_debug("device class '%s': registering\n", cls->name); cp = kzalloc(sizeof(*cp), GFP_KERNEL); //分配私有數(shù)據(jù)空間 if (!cp) return -ENOMEM; klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); //初始化該class的device鏈表 INIT_LIST_HEAD(&cp->interfaces); //初始化接口鏈表 kset_init(&cp->glue_dirs); __mutex_init(&cp->mutex, "subsys mutex", key); error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); //將在/sys/class/目錄下顯示該名稱 if (error) { kfree(cp); return error; } /* set the default /sys/dev directory for devices of this class */ if (!cls->dev_kobj) cls->dev_kobj = sysfs_dev_char_kobj;#if defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ if (!sysfs_deprecated || cls != &block_class) cp->subsys.kobj.kset = class_kset;#else cp->subsys.kobj.kset = class_kset; // 全局變量class_kset指的是 /sys/class/#endif cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; //class與subsys_private關(guān)聯(lián) cls->p = cp; error = kset_register(&cp->subsys); //在/sys/class/目錄下創(chuàng)建該類對應(yīng)的目錄 if (error) { kfree(cp); return error; } error = add_class_attrs(class_get(cls)); //在/sys/class/xxx/目錄下創(chuàng)建類屬性文件 class_put(cls); return error;}EXPORT_SYMBOL_GPL(__class_register);
類的注冊比較簡單,注釋已經比較詳細。當子系統需要卸載類時,需要調用class_unregister函數:
/*
 * class_unregister - remove a class registered with class_register().
 * @cls: class to unregister.
 */
void class_unregister(struct class *cls)
{
	pr_debug("device class '%s': unregistering\n", cls->name);
	/* Remove the class attribute files under /sys/class/xxx/. */
	remove_class_attrs(cls);
	/* Unregister the class's kset, i.e. its /sys/class/xxx/ directory. */
	kset_unregister(&cls->p->subsys);
}
/* struct class_interface - an interface attached to a device class. */
struct class_interface {
	struct list_head	node;	/* node in the class's interface list */
	struct class		*class;	/* class this interface belongs to */

	/*
	 * Called for each device of the class when the interface is
	 * registered/unregistered, to add the device to or remove it from
	 * the interface.  The article states they are also invoked when a
	 * device joins the class; that path is not part of this excerpt.
	 */
	int (*add_dev)		(struct device *, struct class_interface *);
	void (*remove_dev)	(struct device *, struct class_interface *);
};
向類中注冊接口,需要調(diào)用class_interface_register函數(shù)完成:
int class_interface_register(struct class_interface *class_intf){ struct class *parent; struct class_dev_iter iter; struct device *dev; if (!class_intf || !class_intf->class) //確保class和class_interface都存在 return -ENODEV; parent = class_get(class_intf->class); //增加引用計(jì)數(shù),并返回接口所屬的class if (!parent) return -EINVAL; mutex_lock(&parent->p->mutex); list_add_tail(&class_intf->node, &parent->p->interfaces); //將class_interface添加到class的接口鏈表 if (class_intf->add_dev) { //如果接口設(shè)置了add_dev方法,對該class的所有device調(diào)用 class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->add_dev(dev, class_intf); //接口方法作用于設(shè)備 class_dev_iter_exit(&iter); } mutex_unlock(&parent->p->mutex); return 0;}
從類中刪除接口,需要調(diào)用class_interface_unregister函數(shù)完成:
void class_interface_unregister(struct class_interface *class_intf){ struct class *parent = class_intf->class; struct class_dev_iter iter; struct device *dev; if (!parent) return; mutex_lock(&parent->p->mutex); list_del_init(&class_intf->node); //將class_interface從class的接口鏈表中刪除 if (class_intf->remove_dev) { //如果接口設(shè)置了remove_dev方法,對該class的所有device調(diào)用 class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->remove_dev(dev, class_intf); //接口方法作用于設(shè)備 class_dev_iter_exit(&iter); } mutex_unlock(&parent->p->mutex); class_put(parent);}
Linux設備驅動模型已經實現了各種對象之間的關系及其在sysfs中的呈現方式。實現子系統時,只需要定義業務自身的總線類型、設備、驅動、類、接口,分別“繼承”bus_type、device、device_driver、class、class_interface,并根據具體業務實現各個結構規定的回調函數,最后調用上述的注冊函數添加到系統,便完成子系統的開發。
低層:代表與SCSI的物理接口的實(shí)際驅(qū)動器,例如各個(gè)廠商為其特定的主機(jī)適配器(Host Bus Adapter, HBA)開發(fā)的驅(qū)動,低層驅(qū)動主要作用是發(fā)現(xiàn)連接到主機(jī)適配器的scsi設(shè)備,在內(nèi)存中構(gòu)建scsi子系統(tǒng)所需的數(shù)據(jù)結(jié)構(gòu),并提供消息傳遞接口,將scsi命令的接受與發(fā)送解釋為主機(jī)適配器的操作。
高層: 代表各種scsi設(shè)備類型的驅(qū)動,如scsi磁盤驅(qū)動,scsi磁帶驅(qū)動,高層驅(qū)動認(rèn)領(lǐng)低層驅(qū)動發(fā)現(xiàn)的scsi設(shè)備,為這些設(shè)備分配名稱,將對設(shè)備的IO轉(zhuǎn)換為scsi命令,交由低層驅(qū)動處理。
中層:包含scsi棧的公共服務(wù)函數(shù)。高層和低層通過調(diào)用中層的函數(shù)完成其功能,而中層在執(zhí)行過程中,也需要調(diào)用高層和低層注冊的回調(diào)函數(shù)做一些個(gè)性化處理。
Linux SCSI模型是內(nèi)核的抽象,主機(jī)適配器連接主機(jī)IO總線(如PCI總線)和存儲IO總線(如SCSI總線)。一臺計(jì)算機(jī)可以有多個(gè)主機(jī)適配器,而主機(jī)適配器可以控制一條或多條SCSI總線,一條總線可以有多個(gè)目標(biāo)節(jié)點(diǎn)與之相連,并且一個(gè)目標(biāo)節(jié)點(diǎn)可以有多個(gè)邏輯單元。
在Linux SCSI子系統(tǒng)中,內(nèi)核中的目標(biāo)節(jié)點(diǎn)(target)對應(yīng)SCSI磁盤,SCSI磁盤中可以有多個(gè)邏輯單元,統(tǒng)一由磁盤控制器控制,這些邏輯單元才是真正作為IO終點(diǎn)的存儲設(shè)備,內(nèi)核用設(shè)備(device)對邏輯單元進(jìn)行抽象;內(nèi)核中的Host對應(yīng)主機(jī)適配器(物理的HBA/RAID卡,虛擬的iscsi target)
內核使用四元組<host:channel:id:lun>來唯一標識一個scsi的邏輯單元,在sysfs中查看sda磁盤<2:0:0:0>顯示如下:
root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/
alignment_offset  device             events_poll_msecs  integrity  removable  sda5    subsystem
bdi               discard_alignment  ext_range          power      ro         size    trace
capability        events             holders            queue      sda1       slaves  uevent
dev               events_async       inflight           range      sda2       stat
root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev
8:0
root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda
brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda
host: 主機(jī)適配器的唯一編號。
channel: 主機(jī)適配器中scsi通道編號,由主機(jī)適配器固件維護(hù)。
id: 目標(biāo)節(jié)點(diǎn)唯一標(biāo)識符。
lun: 目標(biāo)節(jié)點(diǎn)內(nèi)邏輯單元編號。
SCSI 命令是在 Command Descriptor Block (CDB) 中定義的。CDB 包含了用來定義要執(zhí)行的特定操作的操作代碼,以及大量特定于操作的參數(shù)。
命令 | 用途 |
---|---|
Test unit ready | 查詢設(shè)備是否已經(jīng)準(zhǔn)備好進(jìn)行傳輸 |
Inquiry | 請求設(shè)備基本信息 |
Request sense | 請求之前命令的錯(cuò)誤信息 |
Read capacity | 請求存儲容量信息 |
Read | 從設(shè)備讀取數(shù)據(jù) |
Write | 向設(shè)備寫入數(shù)據(jù) |
Mode sense | 請求模式頁面(設(shè)備參數(shù)) |
Mode select | 在模式頁面配置設(shè)備參數(shù) |
借助大約 60 種可用命令,SCSI 可適用于許多設(shè)備(包括隨機(jī)存取設(shè)備,比如磁盤和像磁帶這樣的順序存儲設(shè)備)。SCSI 也提供了專門的命令以訪問箱體服務(wù)(比如存儲箱體內(nèi)部當(dāng)前的傳感和溫度)。
主機適配器模板是相同型號主機適配器的公共內容,包括請求隊列深度,SCSI命令處理回調函數,錯誤處理恢復函數。分配主機適配器結構時,需要使用主機適配器模板來賦值。在編寫SCSI低層驅動時,第一步便是定義模板scsi_host_template,之后才能由模板生成主機適配器。
struct scsi_host_template { struct module *module; //指向使用該模板實(shí)現(xiàn)的scsi_host,低層驅(qū)動模塊。 const char *name; //主機(jī)適配器名稱 int (* detect)(struct scsi_host_template *); int (* release)(struct Scsi_Host *); const char *(* info)(struct Scsi_Host *); //返回HBA相關(guān)信息,可選實(shí)現(xiàn) int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用戶空間ioctl函數(shù)的實(shí)現(xiàn),可選實(shí)現(xiàn)#ifdef CONFIG_COMPAT //通過該函數(shù),支持32位系統(tǒng)的用戶態(tài)ioctl函數(shù) int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);#endif //將scsi命令放進(jìn)低層驅(qū)動的隊(duì)列,由中間層調(diào)用,必須實(shí)現(xiàn) int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); //以下5個(gè)函數(shù)是錯(cuò)誤處理回調(diào)函數(shù),由中間層按照嚴(yán)重程度調(diào)用 int (* eh_abort_handler)(struct scsi_cmnd *); //Abort int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset int (* eh_bus_reset_handler)(struct scsi_cmnd *); //Bus Reset int (* eh_host_reset_handler)(struct scsi_cmnd *); //Host Reset //當(dāng)掃描到新磁盤時(shí)調(diào)用,中間層回調(diào)這個(gè)函數(shù)中可以分配和初始化低層驅(qū)動所需要的結(jié)構(gòu) int (* slave_alloc)(struct scsi_device *)//在設(shè)備受到INQUIRY命令后,執(zhí)行相關(guān)的配置操作 int (* slave_configure)(struct scsi_device *); //在scsi設(shè)備銷毀之前調(diào)用,中間層回調(diào)用于釋放slave_alloc分配的私有數(shù)據(jù) void (* slave_destroy)(struct scsi_device *); //當(dāng)發(fā)現(xiàn)新的target,中間層調(diào)用,用戶分配target私有數(shù)據(jù) int (* target_alloc)(struct scsi_target *); //在target被銷毀之前,中間層調(diào)用,低層驅(qū)動實(shí)現(xiàn),用于釋放target_alloc分配的數(shù)據(jù) void (* target_destroy)(struct scsi_target *); //需要自定義掃描target邏輯時(shí),中間層循環(huán)檢查返回值,直到該函數(shù)返回1,表示掃描完成 int (* scan_finished)(struct Scsi_Host *, unsigned long); //需要自定義掃描target邏輯時(shí),掃描開始前回調(diào) void (* scan_start)(struct Scsi_Host *); //改變主機(jī)適配器的隊(duì)列深度,返回設(shè)置的隊(duì)列深度 int (* change_queue_depth)(struct scsi_device *, int); //返回磁盤的BIOS參數(shù),如size, device, list (heads, sectors, cylinders) int 
(* bios_param)(struct scsi_device *, struct block_device *, sector_t, int []); void (*unlock_native_capacity)(struct scsi_device *); //在procfs中的讀寫操作回調(diào) int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); //中間層發(fā)現(xiàn)scsi命令超時(shí)回調(diào) enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); //通過sysfs屬性reset主機(jī)適配器時(shí),回調(diào) int (*host_reset)(struct Scsi_Host *shost, int reset_type);#define SCSI_ADAPTER_RESET 1#define SCSI_FIRMWARE_RESET 2 const char *proc_name; //在proc文件系統(tǒng)的名稱 struct proc_dir_entry *proc_dir; int can_queue; //主機(jī)適配器能同時(shí)接受的命令數(shù) int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ //聚散列表的參數(shù) unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ //單個(gè)scsi命令能夠訪問的扇區(qū)最大數(shù)量 unsigned int max_sectors; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; //DMA聚散段邊界值,超過該值將被切割成兩個(gè)#define SCSI_DEFAULT_MAX_SECTORS 1024 short cmd_per_lun; /* * present contains counter indicating how many boards of this * type were found when we did the scan. */ unsigned char present; /* If use block layer to manage tags, this is tag allocation policy */ int tag_alloc_policy; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. */ unsigned supported_mode:2; //低層驅(qū)動支持的模式(initiator或target) /* * True if this host adapter uses unchecked DMA onto an ISA bus. */ unsigned unchecked_isa_dma:1; unsigned use_clustering:1; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. 
*/ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* * True if asynchronous aborts are not supported */ unsigned no_async_abort:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; //主機(jī)適配器發(fā)送隊(duì)列的低閥值,允許累計(jì)多個(gè)命令同時(shí)派發(fā)#define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the sysfs class properties for this host, NULL terminated. */ struct device_attribute **shost_attrs; //主機(jī)適配器類屬性 /* * Pointer to the SCSI device properties for this host, NULL terminated. */ struct device_attribute **sdev_attrs; //主機(jī)適配器設(shè)備屬性 struct list_head legacy_hosts; u64 vendor_id; /* * Additional per-command data allocated for the driver. */ //scsi 命令緩沖池,scsi命令都是預(yù)先分配好的,保存在cmd_pool中 unsigned int cmd_size; struct scsi_host_cmd_pool *cmd_pool; /* temporary flag to disable blk-mq I/O path */ bool disable_blk_mq; //禁用通用塊層多隊(duì)列模式標(biāo)志};
Scsi_Host描述一個SCSI主機適配器,SCSI主機適配器通常是一塊基于PCI總線的擴展卡或是一個SCSI控制器芯片。每個SCSI主機適配器可以存在多個通道,一個通道實際擴展了一條SCSI總線。每個通道可以連接多個SCSI目標節點,具體連接數量與SCSI總線帶載能力有關,或者受具體SCSI協議的限制。 真實的主機總線適配器是接入主機IO總線上(通常是PCI總線),在系統啟動時,會掃描掛載在PCI總線上的設備,此時會分配主機總線適配器。
Scsi_Host結(jié)構(gòu)包含內(nèi)嵌通用設(shè)備,將被鏈入SCSI總線類型(scsi_bus_type)的設(shè)備鏈表。
struct Scsi_Host { struct list_head __devices; //設(shè)備鏈表 struct list_head __targets; //目標(biāo)節(jié)點(diǎn)鏈表 struct scsi_host_cmd_pool *cmd_pool; //scsi命令緩沖池 spinlock_t free_list_lock; //保護(hù)free_list struct list_head free_list; /* backup store of cmd structs, scsi命令預(yù)先分配的備用命令鏈表 */ struct list_head starved_list; //scsi命令的饑餓鏈表 spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_cmd_q; //執(zhí)行錯(cuò)誤的scsi命令的鏈表 struct task_struct * ehandler; /* Error recovery thread. 錯(cuò)誤恢復(fù)線程 */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; //scsi設(shè)備恢復(fù)等待隊(duì)列 struct scsi_host_template *hostt; //主機(jī)適配器模板 struct scsi_transport_template *transportt; //指向SCSI傳輸層模板 /* * Area to keep a shared tag map (if needed, will be * NULL if not). */ union { struct blk_queue_tag *bqt; struct blk_mq_tag_set tag_set; //SCSI支持多隊(duì)列時(shí)使用 }; //已經(jīng)派發(fā)給主機(jī)適配器(低層驅(qū)動)的scsi命令數(shù) atomic_t host_busy; /* commands actually active on low-level */ atomic_t host_blocked; //阻塞的scsi命令數(shù) unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. 系統(tǒng)內(nèi)唯一標(biāo)識 */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; //記錄上次reset時(shí)間 /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 8 for SCSI parallel systems). */ unsigned int max_channel; //主機(jī)適配器的最大通道編號 unsigned int max_id; //主機(jī)適配器目標(biāo)節(jié)點(diǎn)最大編號 u64 max_lun; //主機(jī)適配器lun最大編號 unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. 
* Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. */ unsigned short max_cmd_len; //主機(jī)適配器可以接受的最長的SCSI命令 //下面這段在scsi_host_template中也有,由template中的字段賦值 int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned long dma_boundary; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. */ unsigned nr_hw_queues; //在scsi-mq模式中,低層驅(qū)動所支持的硬件隊(duì)列的數(shù)量 /* * Used to assign serial numbers to the cmds. * Protected by the host lock. */ unsigned long cmd_serial_number; //指向命令序列號unsigned active_mode:2; //標(biāo)識是initiator或target unsigned unchecked_isa_dma:1; unsigned use_clustering:1; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; //表示低層驅(qū)動要求阻塞該主機(jī)適配器,此時(shí)中間層不會繼續(xù)派發(fā)命令到主機(jī)適配器隊(duì)列中 /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). 
*/ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; //任務(wù)管理函數(shù)正在執(zhí)行 /* Asynchronous scan in progress */ unsigned async_scan:1; //異步掃描正在執(zhí)行 /* Don't resume host in EH */ unsigned eh_noresume:1; //在錯(cuò)誤處理過程不恢復(fù)主機(jī)適配器 /* The controller does not support WRITE SAME */ unsigned no_write_same:1; unsigned use_blk_mq:1; //是否使用SCSI多隊(duì)列模式 unsigned use_cmd_list:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* * Optional work queue to be utilized by the transport */ char work_q_name[20]; //被scsi傳輸層使用的工作隊(duì)列 struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; //任務(wù)管理函數(shù)工作隊(duì)列 /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; //在派發(fā)隊(duì)列中累計(jì)命令達(dá)到這個(gè)數(shù)值,才開始喚醒主機(jī)適配器 /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* * q used for scsi_tgt msgs, async events or any other requests that * need to be processed in userspace */ struct request_queue *uspace_req_q; //需要在用戶空間處理的scsi_tgt消息、異步事件或其他請求的請求隊(duì)列 /* legacy crap */ unsigned long base; unsigned long io_port; //I/O端口編號 unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; //狀態(tài) /* ldm bits */ //shost_gendev: 內(nèi)嵌通用設(shè)備,SCSI設(shè)備通過這個(gè)域鏈入SCSI總線類型(scsi_bus_type)的設(shè)備鏈表 struct device shost_gendev, shost_dev; //shost_dev: 內(nèi)嵌類設(shè)備, SCSI設(shè)備通過這個(gè)域鏈入SCSI主機(jī)適配器類型(shost_class)的設(shè)備鏈表 /* * List of hosts per template. * * This is only for use by scsi_module.c for legacy templates. * For these access to it is synchronized implicitly by * module_init/module_exit. 
*/ struct list_head sht_legacy_list; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; //指向獨(dú)立分配的傳輸層數(shù)據(jù),由SCSI傳輸層使用 /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ //主機(jī)適配器專有數(shù)據(jù) unsigned long hostdata[0] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long))));};
scsi_target結(jié)構(gòu)中有一個(gè)內(nèi)嵌驅(qū)動模型設(shè)備,被鏈入SCSI總線類型scsi_bus_type的設(shè)備鏈表。
struct scsi_target { struct scsi_device *starget_sdev_user; //指向正在進(jìn)行I/O的scsi設(shè)備,沒有IO則指向NULL struct list_head siblings; //鏈入主機(jī)適配器target鏈表中 struct list_head devices; //屬于該target的device鏈表 struct device dev; //通用設(shè)備,用于加入設(shè)備驅(qū)動模型 struct kref reap_ref; /* last put renders target invisible 本結(jié)構(gòu)的引用計(jì)數(shù) */ unsigned int channel; //該target所在的channel號 unsigned int id; /* target id ... replace * scsi_device.id eventually */ unsigned int create:1; /* signal that it needs to be added */ unsigned int single_lun:1; /* Indicates we should only * allow I/O to one of the luns * for the device at a time. */ unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f * means no lun present. */ unsigned int no_report_luns:1; /* Don't use * REPORT LUNS for scanning. */ unsigned int expecting_lun_change:1; /* A device has reported * a 3F/0E UA, other devices on * the same target will also. */ /* commands actually active on LLD. */ atomic_t target_busy; atomic_t target_blocked; //當(dāng)前阻塞的命令數(shù) /* * LLDs should set this in the slave_alloc host template callout. * If set to zero then there is not limit. */ unsigned int can_queue; //同時(shí)處理的命令數(shù) unsigned int max_target_blocked; //阻塞命令數(shù)閥值#define SCSI_DEFAULT_TARGET_BLOCKED 3 char scsi_level; //支持的SCSI規(guī)范級別 enum scsi_target_state state; //target狀態(tài) void *hostdata; /* available to low-level driver */ unsigned long starget_data[0]; /* for the transport SCSI傳輸層(中間層)使用 */ /* starget_data must be the last element!!!! */} __attribute__((aligned(sizeof(unsigned long))));
scsi_device描述scsi邏輯設(shè)備,代表scsi磁盤的邏輯單元lun。scsi_device描述符所代表的設(shè)備可能是另一臺存儲設(shè)備上的SATA/SAS/SCSI磁盤或SSD。操作系統(tǒng)在掃描到連接在主機(jī)適配器上的邏輯設(shè)備時(shí),創(chuàng)建scsi_device結(jié)構(gòu),用于scsi高層驅(qū)動和該設(shè)備通信。
struct scsi_device { struct Scsi_Host *host; //所歸屬的主機(jī)總線適配器 struct request_queue *request_queue; //請求隊(duì)列 /* the next two are protected by the host->host_lock */ struct list_head siblings; /* list of all devices on this host */ //鏈入主機(jī)總線適配器設(shè)備鏈表 struct list_head same_target_siblings; /* just the devices sharing same target id */ //鏈入target的設(shè)備鏈表 atomic_t device_busy; /* commands actually active on LLDD */ atomic_t device_blocked; /* Device returned QUEUE_FULL. */ spinlock_t list_lock; struct list_head cmd_list; /* queue of in use SCSI Command structures */ struct list_head starved_entry; //鏈入主機(jī)適配器的"饑餓"鏈表 struct scsi_cmnd *current_cmnd; /* currently active command */ //當(dāng)前正在執(zhí)行的命令 unsigned short queue_depth; /* How deep of a queue we want */ unsigned short max_queue_depth; /* max queue depth */ unsigned short last_queue_full_depth; /* These two are used by */ unsigned short last_queue_full_count; /* scsi_track_queue_full() */ unsigned long last_queue_full_time; /* last queue full time */ unsigned long queue_ramp_up_period; /* ramp up period in jiffies */#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ) unsigned long last_queue_ramp_up; /* last queue ramp up time */ unsigned int id, channel; //scsi_device所屬的target id和所在channel通道號 u64 lun; //該設(shè)備的lun編號 unsigned int manufacturer; /* Manufacturer of device, for using 制造商 * vendor-specific cmd's */ unsigned sector_size; /* size in bytes 硬件的扇區(qū)大小 */ void *hostdata; /* available to low-level driver 專有數(shù)據(jù) */ char type; //SCSI設(shè)備類型 char scsi_level; //所支持SCSI規(guī)范的版本號,由INQUIRY命令獲得 char inq_periph_qual; /* PQ from INQUIRY data */ unsigned char inquiry_len; /* valid bytes in 'inquiry' */ unsigned char * inquiry; /* INQUIRY response data */ const char * vendor; /* [back_compat] point into 'inquiry' ... */ const char * model; /* ... after scan; point to static string */ const char * rev; /* ... 
"nullnullnullnull" before scan */#define SCSI_VPD_PG_LEN 255 int vpd_pg83_len; //sense命令 0x83 unsigned char *vpd_pg83; int vpd_pg80_len; //sense命令 0x80 unsigned char *vpd_pg80; unsigned char current_tag; /* current tag */ struct scsi_target *sdev_target; /* used only for single_lun */ unsigned int sdev_bflags; /* black/white flags as also found in * scsi_devinfo.[hc]. For now used only to * pass settings from slave_alloc to scsi * core. */ unsigned int eh_timeout; /* Error handling timeout */ unsigned removable:1; unsigned changed:1; /* Data invalid due to media change */ unsigned busy:1; /* Used to prevent races */ unsigned lockable:1; /* Able to prevent media removal */ unsigned locked:1; /* Media removal disabled */ unsigned borken:1; /* Tell the Seagate driver to be * painfully slow on this device */ unsigned disconnect:1; /* can disconnect */ unsigned soft_reset:1; /* Uses soft reset option */ unsigned sdtr:1; /* Device supports SDTR messages 支持同步數(shù)據(jù)傳輸 */ unsigned wdtr:1; /* Device supports WDTR messages 支持16位寬數(shù)據(jù)傳輸*/ unsigned ppr:1; /* Device supports PPR messages 支持PPR(并行協(xié)議請求)消息*/ unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */ unsigned simple_tags:1; /* simple queue tag messages are enabled */ unsigned was_reset:1; /* There was a bus reset on the bus for * this device */ unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN * because we did a bus reset. 
*/ unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */ unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */ unsigned skip_vpd_pages:1; /* do not read VPD pages */ unsigned try_vpd_pages:1; /* attempt to read VPD pages */ unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */ unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */ unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */ unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */ unsigned last_sector_bug:1; /* do not use multisector accesses on SD_LAST_BUGGY_SECTORS */ unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */ unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */ unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */ unsigned is_visible:1; /* is the device visible in sysfs */ unsigned wce_default_on:1; /* Cache is ON by default */ unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */ DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */ struct list_head 
event_list; /* asserted events */ struct work_struct event_work; unsigned int max_device_blocked; /* what device_blocked counts down from */#define SCSI_DEFAULT_DEVICE_BLOCKED 3 atomic_t iorequest_cnt; atomic_t iodone_cnt; atomic_t ioerr_cnt; struct device sdev_gendev, //內(nèi)嵌通用設(shè)備, 鏈入scsi總線類型(scsi_bus_type)的設(shè)備鏈表 sdev_dev; //內(nèi)嵌類設(shè)備,鏈入scsi設(shè)備類(sdev_class)的設(shè)備鏈表 struct execute_work ew; /* used to get process context on put */ struct work_struct requeue_work; struct scsi_device_handler *handler; //自定義設(shè)備處理函數(shù) void *handler_data; enum scsi_device_state sdev_state; //scsi設(shè)備狀態(tài) unsigned long sdev_data[0]; //scsi傳輸層使用} __attribute__((aligned(sizeof(unsigned long))));
scsi_cmnd結構由SCSI中間層創建,傳遞到SCSI低層驅動。每個IO請求會被創建一個scsi_cmnd,但scsi_cmnd并不一定是IO請求。scsi_cmnd最終轉化成一個具體的SCSI命令。除了命令描述塊之外,scsi_cmnd包含更豐富的信息,包括數據緩沖區、感測數據緩沖區、完成回調函數以及所關聯的塊設備驅動層請求等,是SCSI中間層執行SCSI命令的上下文。
/*
 * struct scsi_cmnd - execution context of one SCSI command.
 *
 * Besides the command bytes themselves it carries the data buffer, the
 * sense buffer, the completion callback and the associated block-layer
 * request.
 */
struct scsi_cmnd {
	struct scsi_device *device;	/* SCSI device this command belongs to */
	struct list_head list;		/* scsi_cmnd participates in queue lists
					 * (linked into the device's command list) */
	struct list_head eh_entry;	/* entry for the host eh_cmd_q */
	struct delayed_work abort_work;
	int eh_eflags;			/* Used by error handlr */

	/*
	 * A SCSI Command is assigned a nonzero serial_number before passed
	 * to the driver's queue command function.  The serial_number is
	 * cleared when scsi_done is entered indicating that the command
	 * has been completed.  It is a bug for LLDDs to use this number
	 * for purposes other than printk (and even that is only useful
	 * for debugging).
	 */
	unsigned long serial_number;	/* unique serial number of the command */

	/*
	 * This is set to jiffies as it was when the command was first
	 * allocated.  It is used to time how long the command has
	 * been outstanding
	 */
	unsigned long jiffies_at_alloc;	/* jiffies at allocation time */

	int retries;			/* number of retries of this command */
	int allowed;			/* number of retries permitted */

	unsigned char prot_op;		/* protection operation (DIF and DIX) */
	unsigned char prot_type;	/* DIF protection type */
	unsigned char prot_flags;

	unsigned short cmd_len;		/* command length */
	enum dma_data_direction sc_data_direction;	/* data transfer direction */

	/* These elements define the operation we are about to perform */
	unsigned char *cmnd;		/* command bytes in SCSI-specified format */

	/* These elements define the operation we ultimately want to perform */
	struct scsi_data_buffer sdb;		/* data buffer of the command */
	struct scsi_data_buffer *prot_sdb;	/* protection-information buffer */

	unsigned underflow;	/* Return error if less than
				   this amount is transferred */

	unsigned transfersize;	/* How much we are guaranteed to
				   transfer with each SCSI transfer
				   (ie, between disconnect /
				   reconnects.   Probably == sector
				   size */

	struct request *request;	/* The command we are
				   	   working on (block-layer request
					   descriptor) */

#define SCSI_SENSE_BUFFERSIZE 	96
	unsigned char *sense_buffer;
				/* sense data buffer of the command:
				 * obtained by REQUEST SENSE when
				 * CHECK CONDITION is received on original
				 * command (auto-sense) */

	/* Low-level done function - can be used by low-level driver to point
	 *        to completion function.  Not used by mid/upper level code. */
	void (*scsi_done) (struct scsi_cmnd *);	/* called back when the low-level
						 * driver completes the command */

	/*
	 * The following fields can be written to by the host specific code.
	 * Everything else should be left alone.
	 */
	struct scsi_pointer SCp;	/* Scratchpad used by some host adapters */

	unsigned char *host_scribble;	/* The host adapter is allowed to
					 * call scsi_malloc and get some memory
					 * and hang it here.  The host adapter
					 * is also expected to call scsi_free
					 * to release this memory.  (The memory
					 * obtained by scsi_malloc is guaranteed
					 * to be at an address < 16Mb). */

	int result;		/* Status code from lower level driver */
	int flags;		/* Command flags */

	unsigned char tag;	/* SCSI-II queued command tag */
};
/*
 * struct scsi_driver - a SCSI driver built on top of the generic
 * struct device_driver, extended with SCSI-specific callbacks.
 */
struct scsi_driver {
	struct device_driver	gendrv;		/* embedded generic driver ("inherits" device_driver) */

	void (*rescan)(struct device *);	/* callback invoked before a rescan */
	int (*init_command)(struct scsi_cmnd *);
	void (*uninit_command)(struct scsi_cmnd *);
	int (*done)(struct scsi_cmnd *);	/* called when the low-level driver has
						 * completed a command; used to work out
						 * how many bytes were completed */
	int (*eh_action)(struct scsi_cmnd *, int);	/* error-handling callback */
};
scsi_bus_type: scsi子系統(tǒng)總線類型
/* Bus type of the SCSI subsystem; exported for use by other modules. */
struct bus_type scsi_bus_type = {
        .name		= "scsi",	/* corresponds to /sys/bus/scsi */
        .match		= scsi_bus_match,
	.uevent		= scsi_bus_uevent,
#ifdef CONFIG_PM
	.pm		= &scsi_bus_pm_ops,	/* only present when CONFIG_PM is set */
#endif
};
EXPORT_SYMBOL_GPL(scsi_bus_type);
shost_class: scsi子系統(tǒng)類
/* Device class of SCSI host adapters. */
static struct class shost_class = {
	.name		= "scsi_host",	/* corresponds to /sys/class/scsi_host */
	.dev_release	= scsi_host_cls_release,
};
操作系統(tǒng)啟動時(shí),會加載scsi子系統(tǒng),入口函數(shù)是init_scsi,使用subsys_initcall定義:
/*
 * init_scsi - entry point that brings up the SCSI subsystem.
 *
 * Runs the individual initialisation steps in order.  When a step fails,
 * every step that already succeeded is undone in reverse order through the
 * cleanup labels and the failing step's error code is returned.
 *
 * Returns 0 on success, otherwise the error from the failing step.
 */
static int __init init_scsi(void)
{
	int error;

	error = scsi_init_queue();	/* memory pools needed for scatter-gather lists */
	if (error)
		return error;		/* nothing to unwind yet */
	error = scsi_init_procfs();	/* SCSI-related procfs entries */
	if (error)
		goto cleanup_queue;
	error = scsi_init_devinfo();	/* dynamic SCSI device-info list */
	if (error)
		goto cleanup_procfs;
	error = scsi_init_hosts();	/* registers the shost_class class */
	if (error)
		goto cleanup_devlist;
	error = scsi_init_sysctl();	/* SCSI sysctl table */
	if (error)
		goto cleanup_hosts;
	error = scsi_sysfs_register();	/* registers scsi_bus_type and sdev_class */
	if (error)
		goto cleanup_sysctl;

	scsi_netlink_init();		/* SCSI transport netlink interface */

	printk(KERN_NOTICE "SCSI subsystem initialized\n");
	return 0;

	/* Each label undoes one step, then falls through to the earlier ones. */
cleanup_sysctl:
	scsi_exit_sysctl();
cleanup_hosts:
	scsi_exit_hosts();
cleanup_devlist:
	scsi_exit_devinfo();
cleanup_procfs:
	scsi_exit_procfs();
cleanup_queue:
	scsi_exit_queue();
	/* error is negated before printing, so the log shows a positive number */
	printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
	       -error);
	return error;
}
scsi_init_hosts函數(shù)初始化scsi子系統(tǒng)主機(jī)適配器所屬的類shost_class:
/*
 * scsi_init_hosts - register the host adapter class (shost_class).
 *
 * Returns whatever class_register() returns.
 */
int scsi_init_hosts(void)
{
	int err;

	err = class_register(&shost_class);
	return err;
}
scsi_sysfs_register函數(shù)初始化scsi子系統(tǒng)總線類型scsi_bus_type和設(shè)備所屬的類sdev_class類:
/*
 * scsi_sysfs_register - register the SCSI bus type and the SCSI device class.
 *
 * The bus is registered first.  If registering sdev_class then fails, the
 * bus registration is rolled back so nothing is left half set up.
 *
 * Returns 0 on success or the error of the step that failed.
 */
int scsi_sysfs_register(void)
{
	int rc = bus_register(&scsi_bus_type);

	if (rc)
		return rc;

	rc = class_register(&sdev_class);
	if (rc)
		bus_unregister(&scsi_bus_type);

	return rc;
}
scsi低層驅(qū)動是面向主機(jī)適配器的,低層驅(qū)動被加載時(shí),需要添加主機(jī)適配器。主機(jī)適配器添加有兩種方式:1.在PCI子系統(tǒng)掃描掛載驅(qū)動時(shí)添加;2.手動方式添加。所有基于硬件PCI接口的主機(jī)適配器都采用第一種方式。添加主機(jī)適配器包括兩個(gè)步驟:
1. 分配主機適配器數據結構:scsi_host_alloc
2. 將主機(jī)適配器添加到系統(tǒng)scsi_add_host
/*
 * scsi_host_alloc - allocate and initialise a Scsi_Host.
 * @sht:      host template supplying the driver's defaults and callbacks
 * @privsize: extra bytes allocated directly after the Scsi_Host for the
 *            driver's private data
 *
 * Fills in defaults, copies the template's limits, initialises the two
 * embedded devices, then starts the error-handler thread and the
 * task-management workqueue.
 *
 * Returns the new host, or NULL if the allocation, the thread or the
 * workqueue could not be created.
 */
struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
{
	struct Scsi_Host *shost;
	gfp_t gfp_mask = GFP_KERNEL;

	if (sht->unchecked_isa_dma && privsize)
		gfp_mask |= __GFP_DMA;

	/* a single allocation covers the Scsi_Host and the private data area */
	shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);
	if (!shost)
		return NULL;

	shost->host_lock = &shost->default_lock;
	spin_lock_init(shost->host_lock);
	shost->shost_state = SHOST_CREATED;
	INIT_LIST_HEAD(&shost->__devices);	/* list of SCSI devices */
	INIT_LIST_HEAD(&shost->__targets);	/* list of targets */
	INIT_LIST_HEAD(&shost->eh_cmd_q);	/* commands queued for error handling */
	INIT_LIST_HEAD(&shost->starved_list);	/* starved list */
	init_waitqueue_head(&shost->host_wait);
	mutex_init(&shost->scan_mutex);

	/*
	 * subtract one because we increment first then return, but we need to
	 * know what the next host number was before increment
	 */
	shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;
	shost->dma_channel = 0xff;

	/* These three are default values which can be overridden */
	shost->max_channel = 0;
	shost->max_id = 8;
	shost->max_lun = 8;

	/* Give each shost a default transportt */
	shost->transportt = &blank_transport_template;

	/*
	 * All drivers right now should be able to handle 12 byte
	 * commands.  Every so often there are requests for 16 byte
	 * commands, but individual low-level drivers need to certify that
	 * they actually do something sensible with such commands.
	 */
	shost->max_cmd_len = 12;
	shost->hostt = sht;
	shost->this_id = sht->this_id;
	shost->can_queue = sht->can_queue;
	shost->sg_tablesize = sht->sg_tablesize;
	shost->sg_prot_tablesize = sht->sg_prot_tablesize;
	shost->cmd_per_lun = sht->cmd_per_lun;
	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
	shost->use_clustering = sht->use_clustering;
	shost->no_write_same = sht->no_write_same;

	/*
	 * eh_deadline is kept in jiffies; -1 disables it.  A configured value
	 * whose jiffies equivalent exceeds INT_MAX is clamped to INT_MAX.
	 */
	if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
		shost->eh_deadline = -1;
	else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {
		shost_printk(KERN_WARNING, shost,
			     "eh_deadline %u too large, setting to %u\n",
			     shost_eh_deadline, INT_MAX / HZ);
		shost->eh_deadline = INT_MAX;
	} else
		shost->eh_deadline = shost_eh_deadline * HZ;

	if (sht->supported_mode == MODE_UNKNOWN)
		/* means we didn't set it ... default to INITIATOR */
		shost->active_mode = MODE_INITIATOR;
	else
		shost->active_mode = sht->supported_mode;

	if (sht->max_host_blocked)
		shost->max_host_blocked = sht->max_host_blocked;
	else
		shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;

	/*
	 * If the driver imposes no hard sector transfer limit, start at
	 * machine infinity initially.
	 */
	if (sht->max_sectors)
		shost->max_sectors = sht->max_sectors;
	else
		shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;

	/*
	 * assume a 4GB boundary, if not set
	 */
	if (sht->dma_boundary)
		shost->dma_boundary = sht->dma_boundary;
	else
		shost->dma_boundary = 0xffffffff;

	shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;

	/* embedded generic device: sits on scsi_bus_type, named "host<N>" */
	device_initialize(&shost->shost_gendev);
	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
	shost->shost_gendev.bus = &scsi_bus_type;
	shost->shost_gendev.type = &scsi_host_type;

	/* embedded class device: child of the generic device, in shost_class */
	device_initialize(&shost->shost_dev);
	shost->shost_dev.parent = &shost->shost_gendev;
	shost->shost_dev.class = &shost_class;
	dev_set_name(&shost->shost_dev, "host%d", shost->host_no);
	shost->shost_dev.groups = scsi_sysfs_shost_attr_groups;

	/* start this host's error-recovery kernel thread */
	shost->ehandler = kthread_run(scsi_error_handler, shost,
			"scsi_eh_%d", shost->host_no);
	if (IS_ERR(shost->ehandler)) {
		shost_printk(KERN_WARNING, shost,
			"error handler thread failed to spawn, error = %ld\n",
			PTR_ERR(shost->ehandler));
		goto fail_kfree;
	}

	/* workqueue used for task-management work */
	shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
					    WQ_UNBOUND | WQ_MEM_RECLAIM,
					   1, shost->host_no);
	if (!shost->tmf_work_q) {
		shost_printk(KERN_WARNING, shost,
			     "failed to create tmf workq\n");
		goto fail_kthread;
	}
	/* add the template's directory in procfs (under /proc/scsi/) */
	scsi_proc_hostdir_add(shost->hostt);
	return shost;

 fail_kthread:
	kthread_stop(shost->ehandler);
 fail_kfree:
	kfree(shost);
	return NULL;
}
EXPORT_SYMBOL(scsi_host_alloc);
/*
 * scsi_add_host - add a host adapter, using @dev both as the parent device
 * and as the device used for DMA.
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
					     struct device *dev)
{
	return scsi_add_host_with_dma(host, dev, dev);
}

/*
 * scsi_add_host_with_dma - add a host adapter to the system.
 * @shost:   host previously set up by scsi_host_alloc()
 * @dev:     parent device; platform_bus is used when this is NULL and the
 *           host has no parent yet
 * @dma_dev: device used for DMA; defaults to the host's parent when NULL
 *
 * Sets up tagging and the command freelist, adds the generic and the class
 * device, allocates transport data and the optional transport workqueue,
 * then creates the sysfs and procfs entries.
 *
 * Returns 0 on success or a negative errno.  On failure the steps already
 * done are undone through the out_* labels.
 */
int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
			   struct device *dma_dev)
{
	struct scsi_host_template *sht = shost->hostt;
	int error = -EINVAL;

	shost_printk(KERN_INFO, shost, "%s\n",
			sht->info ? sht->info(shost) : sht->name);

	if (!shost->can_queue) {
		shost_printk(KERN_ERR, shost,
			     "can_queue = 0 no longer supported\n");
		goto fail;
	}

	/* multi-queue hosts set up blk-mq tags, others a legacy tag map */
	if (shost_use_blk_mq(shost)) {
		error = scsi_mq_setup_tags(shost);
		if (error)
			goto fail;
	} else {
		shost->bqt = blk_init_tags(shost->can_queue,
				shost->hostt->tag_alloc_policy);
		if (!shost->bqt) {
			error = -ENOMEM;
			goto fail;
		}
	}

	/*
	 * Note that we allocate the freelist even for the MQ case for now,
	 * as we need a command set aside for scsi_reset_provider.  Having
	 * the full host freelist and one command available for that is a
	 * little heavy-handed, but avoids introducing a special allocator
	 * just for this.  Eventually the structure of scsi_reset_provider
	 * will need a major overhaul.
	 */
	error = scsi_setup_command_freelist(shost);
	if (error)
		goto out_destroy_tags;

	/*
	 * The parent decides where the host shows up in sysfs.  A parent that
	 * is already set is kept; otherwise @dev is used, or platform_bus
	 * when @dev is NULL.
	 */
	if (!shost->shost_gendev.parent)
		shost->shost_gendev.parent = dev ? dev : &platform_bus;
	if (!dma_dev)
		dma_dev = shost->shost_gendev.parent;

	shost->dma_dev = dma_dev;

	error = device_add(&shost->shost_gendev);
	if (error)
		goto out_destroy_freelist;

	pm_runtime_set_active(&shost->shost_gendev);
	pm_runtime_enable(&shost->shost_gendev);
	device_enable_async_suspend(&shost->shost_gendev);

	scsi_host_set_state(shost, SHOST_RUNNING);
	get_device(shost->shost_gendev.parent);	/* take a reference on the parent */

	device_enable_async_suspend(&shost->shost_dev);

	error = device_add(&shost->shost_dev);
	if (error)
		goto out_del_gendev;

	get_device(&shost->shost_gendev);

	/* per-host data area used by the transport class, if it asks for one */
	if (shost->transportt->host_size) {
		shost->shost_data = kzalloc(shost->transportt->host_size,
					 GFP_KERNEL);
		if (shost->shost_data == NULL) {
			error = -ENOMEM;
			goto out_del_dev;
		}
	}

	/* workqueue used by the transport class, if it asks for one */
	if (shost->transportt->create_work_queue) {
		snprintf(shost->work_q_name, sizeof(shost->work_q_name),
			 "scsi_wq_%d", shost->host_no);
		shost->work_q = create_singlethread_workqueue(
					shost->work_q_name);
		if (!shost->work_q) {
			error = -EINVAL;
			goto out_free_shost_data;
		}
	}

	error = scsi_sysfs_add_host(shost);
	if (error)
		goto out_destroy_host;

	scsi_proc_host_add(shost);
	return error;

	/*
	 * NOTE(review): this unwind path never drops the references taken by
	 * the two get_device() calls above, does not undo
	 * pm_runtime_enable(), and frees no tag map in the non-blk-mq case.
	 * Confirm whether the caller's final put of the host is expected to
	 * cover that.
	 */
 out_destroy_host:
	if (shost->work_q)
		destroy_workqueue(shost->work_q);
 out_free_shost_data:
	kfree(shost->shost_data);
 out_del_dev:
	device_del(&shost->shost_dev);
 out_del_gendev:
	device_del(&shost->shost_gendev);
 out_destroy_freelist:
	scsi_destroy_command_freelist(shost);
 out_destroy_tags:
	if (shost_use_blk_mq(shost))
		scsi_mq_destroy_tags(shost);
 fail:
	return error;
}
EXPORT_SYMBOL(scsi_add_host_with_dma);
在系統啟動過程中,會掃描默認的PCI根總線,從而觸發PCI設備掃描的過程,開始構造PCI設備樹,SCSI主機適配器是掛載在PCI總線上的設備。SCSI主機適配器作為PCI設備會被PCI總線驅動層掃描到(PCI設備的掃描采用配置空間訪問的方式),掃描到SCSI主機適配器后,操作系統開始加載SCSI主機適配器驅動,SCSI主機適配器驅動就是上面所說的低層驅動。SCSI主機適配器驅動根據SCSI主機適配器模板分配SCSI主機適配器描述符,并添加到系統,之后啟動通過SCSI主機適配器擴展出來的下一級總線——SCSI總線的掃描過程。
SCSI中間層依次以可能的ID和LUN構(gòu)造INQUIRY命令,之后將這些INQUIRY命令提交給塊IO子系統(tǒng),后者又最終將調(diào)用SCSI中間層的策略例程,再次提取到SCSI命令結(jié)構(gòu)后,調(diào)用SCSI低層驅(qū)動的queuecommand回調(diào)函數(shù)實(shí)現(xiàn)。
對于給定ID的目標節點,如果它在SCSI總線上存在,那么它一定要實現對LUN0的INQUIRY響應。也就是說,通過向某個ID的目標節點的LUN0發送INQUIRY命令,或依次向各個LUN嘗試發送INQUIRY命令,并檢查是否能收到響應,SCSI中間層最終能夠得到SCSI域中所連接的邏輯設備及其信息。
SCSI總線具體的掃描方式可以由具體的主機(jī)適配器固件、主機(jī)適配器驅(qū)動實(shí)現(xiàn),在此只討論由主機(jī)適配器驅(qū)動調(diào)用scsi中間層提供通用的掃描函數(shù)的實(shí)現(xiàn)方式scsi_scan_host。
void scsi_scan_host(struct Scsi_Host *shost){ struct async_scan_data *data; if (strncmp(scsi_scan_type, "none", 4) == 0) //檢查掃描邏輯 return; if (scsi_autopm_get_host(shost) < 0) return; data = scsi_prep_async_scan(shost); //準(zhǔn)備異步掃描 if (!data) { do_scsi_scan_host(shost); //同步掃描 scsi_autopm_put_host(shost); return; } /* register with the async subsystem so wait_for_device_probe() * will flush this work */ async_schedule(do_scan_async, data); //異步掃描 /* scsi_autopm_put_host(shost) is called in scsi_finish_async_scan() */}EXPORT_SYMBOL(scsi_scan_host);
scsi_scan_host函數是scsi中間層提供的主機適配器掃描函數。對于有自定義掃描邏輯需求的主機適配器驅動,可以設置主機適配器模板的回調函數(scan_start、scan_finished),由scsi_scan_host函數來調用回調實現自定義掃描。
scsi_scan_type變量指定了掃描方式:async、sync、none。無論最終掃描方式是同步還是異步,都是由do_scsi_scan_host函數(shù)實(shí)現(xiàn):
static void do_scsi_scan_host(struct Scsi_Host *shost){ if (shost->hostt->scan_finished) { //使用自定義掃描方式 unsigned long start = jiffies; if (shost->hostt->scan_start) shost->hostt->scan_start(shost); //自定義掃描開始回調(diào) while (!shost->hostt->scan_finished(shost, jiffies - start)) //自定義掃描完成時(shí)返回1 msleep(10); } else { //scsi子系統(tǒng)通用掃描函數(shù), SCAN_WILD_CARD表示掃描所有的target和device scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD, SCAN_WILD_CARD, 0); }}
如果主機(jī)適配器模板設(shè)置了自定義掃描函數(shù),do_scsi_scan_host函數(shù)將會調(diào)用。如果沒有設(shè)置則使用默認(rèn)的掃描函數(shù)scsi_scan_host_selected執(zhí)行掃描。
/*
 * scsi_scan_host_selected - scan the selected channel/id/lun of a host.
 * @shost:   adapter to scan
 * @channel: channel to scan, or SCAN_WILD_CARD for channels 0..max_channel
 * @id:      target id, or SCAN_WILD_CARD
 * @lun:     logical unit, or SCAN_WILD_CARD
 * @rescan:  passed through to the channel scan
 *
 * Returns -EINVAL when a non-wildcard selector is out of range for the
 * host, otherwise 0 (also when the host state does not allow scanning).
 */
int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
			    unsigned int id, u64 lun, int rescan)
{
	SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
		"%s: <%u:%u:%llu>\n",
		__func__, channel, id, lun));

	/* reject explicit selectors beyond the host's limits */
	if (channel != SCAN_WILD_CARD && channel > shost->max_channel)
		return -EINVAL;
	if (id != SCAN_WILD_CARD && id >= shost->max_id)
		return -EINVAL;
	if (lun != SCAN_WILD_CARD && lun >= shost->max_lun)
		return -EINVAL;

	mutex_lock(&shost->scan_mutex);

	if (!shost->async_scan)
		scsi_complete_async_scans();

	/* scan only if the host state allows it and the host can be resumed */
	if (!scsi_host_scan_allowed(shost) || scsi_autopm_get_host(shost) != 0)
		goto unlock;

	if (channel != SCAN_WILD_CARD) {
		scsi_scan_channel(shost, channel, id, lun, rescan);
	} else {
		unsigned int ch;

		for (ch = 0; ch <= shost->max_channel; ch++)
			scsi_scan_channel(shost, ch, id, lun, rescan);
	}
	scsi_autopm_put_host(shost);

unlock:
	mutex_unlock(&shost->scan_mutex);
	return 0;
}
scsi_scan_host_selected函數(shù)掃描指定的主機(jī)適配器,根據(jù)輸入的參數(shù)決定是否遍歷掃描所有channel或掃描指定channel,通過函數(shù)scsi_scan_channel完成。
/*
 * scsi_scan_channel - scan one channel of a host adapter.
 * @shost:   adapter being scanned
 * @channel: channel to scan
 * @id:      target id, or SCAN_WILD_CARD to walk ids 0..max_id-1
 * @lun:     logical unit selector, passed through to the target scan
 * @rescan:  passed through to the target scan
 */
static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
			      unsigned int id, u64 lun, int rescan)
{
	struct device *host_dev = &shost->shost_gendev;
	uint pos;

	if (id != SCAN_WILD_CARD) {
		__scsi_scan_target(host_dev, channel, id, lun, rescan);
		return;
	}

	/*
	 * XXX adapter drivers when possible (FCP, iSCSI)
	 * could modify max_id to match the current max,
	 * not the absolute max.
	 *
	 * XXX add a shost id iterator, so for example,
	 * the FC ID can be the same as a target id
	 * without a huge overhead of sparse id's.
	 */
	for (pos = 0; pos < shost->max_id; pos++) {
		/* with reverse_ordering set, ids are visited from high to low */
		uint target_id = shost->reverse_ordering ?
				 shost->max_id - pos - 1 : pos;

		__scsi_scan_target(host_dev, channel, target_id, lun, rescan);
	}
}
__scsi_scan_target函數掃描指定target內部的lun。
/*
 * __scsi_scan_target - scan the logical units of one target.
 * @parent:  device the target hangs off; its host is found via dev_to_shost()
 * @channel: channel of the target
 * @id:      id of the target; the host's own id is never scanned
 * @lun:     a specific lun, or SCAN_WILD_CARD to discover all luns
 * @rescan:  passed through to the lun probing helpers
 */
static void __scsi_scan_target(struct device *parent, unsigned int channel,
		unsigned int id, u64 lun, int rescan)
{
	struct Scsi_Host *shost = dev_to_shost(parent);
	struct scsi_target *starget;
	int bflags = 0;
	int res;

	/* Don't scan the host adapter */
	if (id == shost->this_id)
		return;

	starget = scsi_alloc_target(parent, channel, id);
	if (!starget)
		return;
	scsi_autopm_get_target(starget);

	if (lun != SCAN_WILD_CARD) {
		/* Scan for a specific host/chan/id/lun. */
		scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
	} else {
		/*
		 * Scan LUN 0, if there is some response, scan further. Ideally,
		 * we would not configure LUN 0 until all LUNs are scanned.
		 */
		res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL, rescan,
					     NULL);
		if ((res == SCSI_SCAN_LUN_PRESENT ||
		     res == SCSI_SCAN_TARGET_PRESENT) &&
		    scsi_report_lun_scan(starget, bflags, rescan) != 0) {
			/*
			 * The REPORT LUN did not scan the target,
			 * do a sequential scan.
			 */
			scsi_sequential_lun_scan(starget, bflags,
						 starget->scsi_level, rescan);
		}
	}

	scsi_autopm_put_target(starget);
	/*
	 * paired with scsi_alloc_target(): determine if the target has
	 * any children at all and if not, nuke it
	 */
	scsi_target_reap(starget);

	put_device(&starget->dev);
}
掃描到target時分配并初始化scsi_target結構,scsi_probe_and_add_lun函數負責探測target中的lun,并將發現的lun添加到系統。
/*
 * scsi_probe_and_add_lun - probe one logical unit and add it if present.
 * @starget:  target the lun belongs to
 * @lun:      logical unit number to probe
 * @bflagsp:  if non-NULL, receives the device flags for the lun
 * @sdevp:    if non-NULL, receives a referenced pointer to the scsi_device
 *            when the lun is present
 * @rescan:   non-zero when this is not the first scan of the host
 * @hostdata: passed to scsi_alloc_sdev() when a new device is allocated
 *
 * Returns:
 *   SCSI_SCAN_NO_RESPONSE    - allocation failed or the INQUIRY probe failed
 *   SCSI_SCAN_TARGET_PRESENT - the target answered but reports no usable
 *                              logical unit at @lun
 *   SCSI_SCAN_LUN_PRESENT    - the lun already existed or was added
 */
static int scsi_probe_and_add_lun(struct scsi_target *starget,
				  u64 lun, int *bflagsp,
				  struct scsi_device **sdevp, int rescan,
				  void *hostdata)
{
	struct scsi_device *sdev;
	unsigned char *result;
	int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

	/*
	 * The rescan flag is used as an optimization, the first scan of a
	 * host adapter calls into here with rescan == 0.
	 */
	sdev = scsi_device_lookup_by_target(starget, lun);
	if (sdev) {
		/*
		 * The device is already known.  On a rescan, or when it has
		 * left the created state, report it as present without
		 * probing again.
		 */
		if (rescan || !scsi_device_created(sdev)) {
			SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
				"scsi scan: device exists on %s\n",
				dev_name(&sdev->sdev_gendev)));
			if (sdevp)
				*sdevp = sdev;	/* lookup reference goes to the caller */
			else
				scsi_device_put(sdev);

			if (bflagsp)
				*bflagsp = scsi_get_device_flags(sdev,
								 sdev->vendor,
								 sdev->model);
			return SCSI_SCAN_LUN_PRESENT;
		}
		scsi_device_put(sdev);
	} else
		sdev = scsi_alloc_sdev(starget, lun, hostdata);
	if (!sdev)
		goto out;

	/* buffer for the INQUIRY response */
	result = kmalloc(result_len, GFP_ATOMIC |
			((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
	if (!result)
		goto out_free_sdev;

	/* send INQUIRY to the device */
	if (scsi_probe_lun(sdev, result, result_len, &bflags))
		goto out_free_result;

	if (bflagsp)
		*bflagsp = bflags;
	/*
	 * result contains valid SCSI INQUIRY data.
	 */
	if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
		/*
		 * For a Peripheral qualifier 3 (011b), the SCSI
		 * spec says: The device server is not capable of
		 * supporting a physical device on this logical
		 * unit.
		 *
		 * For disks, this implies that there is no
		 * logical disk configured at sdev->lun, but there
		 * is a target id responding.
		 */
		/*
		 * Terminated with ';' like every other SCSI_LOG_SCAN_BUS()
		 * use, so the statement does not depend on how the macro
		 * happens to expand.
		 */
		SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
				   " peripheral qualifier of 3, device not"
				   " added\n"));
		if (lun == 0) {
			SCSI_LOG_SCAN_BUS(1, {
				unsigned char vend[9];
				unsigned char mod[17];

				sdev_printk(KERN_INFO, sdev,
					"scsi scan: consider passing scsi_mod."
					"dev_flags=%s:%s:0x240 or 0x1000240\n",
					scsi_inq_str(vend, result, 8, 16),
					scsi_inq_str(mod, result, 16, 32));
			});

		}

		res = SCSI_SCAN_TARGET_PRESENT;
		goto out_free_result;
	}

	/*
	 * Some targets may set slight variations of PQ and PDT to signal
	 * that no LUN is present, so don't add sdev in these cases.
	 * Two specific examples are:
	 * 1) NetApp targets: return PQ=1, PDT=0x1f
	 * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
	 *    in the UFI 1.0 spec (we cannot rely on reserved bits).
	 *
	 * References:
	 * 1) SCSI SPC-3, pp. 145-146
	 * PQ=1: "A peripheral device having the specified peripheral
	 * device type is not connected to this logical unit. However, the
	 * device server is capable of supporting the specified peripheral
	 * device type on this logical unit."
	 * PDT=0x1f: "Unknown or no device type"
	 * 2) USB UFI 1.0, p. 20
	 * PDT=00h Direct-access device (floppy)
	 * PDT=1Fh none (no FDD connected to the requested logical unit)
	 */
	if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
	    (result[0] & 0x1f) == 0x1f &&
	    !scsi_is_wlun(lun)) {
		SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
					"scsi scan: peripheral device type"
					" of 31, no device added\n"));
		res = SCSI_SCAN_TARGET_PRESENT;
		goto out_free_result;
	}

	/* add the device, using the INQUIRY data just read */
	res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
	if (res == SCSI_SCAN_LUN_PRESENT) {
		if (bflags & BLIST_KEY) {
			sdev->lockable = 0;
			scsi_unlock_floptical(sdev, result);
		}
	}

 out_free_result:
	kfree(result);
 out_free_sdev:
	if (res == SCSI_SCAN_LUN_PRESENT) {
		if (sdevp) {
			/* hand the caller its own reference; undo the add if none can be taken */
			if (scsi_device_get(sdev) == 0) {
				*sdevp = sdev;
			} else {
				__scsi_remove_device(sdev);
				res = SCSI_SCAN_NO_RESPONSE;
			}
		}
	} else
		__scsi_remove_device(sdev);
 out:
	return res;
}
scsi_probe_and_add_lun函數(shù)由名字可知,完成lun的probe和add兩個(gè)操作:
1. 探測邏輯設(shè)備scsi_probe_lun,發(fā)送INQUIRY命令到具體設(shè)備。
2. 添加邏輯設(shè)備到系統(tǒng)scsi_add_lun,根據(jù)INQUIRY命令返回值添加lun到系統(tǒng)。
static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result, int result_len, int *bflags){ unsigned char scsi_cmd[MAX_COMMAND_SIZE]; int first_inquiry_len, try_inquiry_len, next_inquiry_len; int response_len = 0; int pass, count, result; struct scsi_sense_hdr sshdr; *bflags = 0; /* Perform up to 3 passes. The first pass uses a conservative * transfer length of 36 unless sdev->inquiry_len specifies a * different value. */ first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36; try_inquiry_len = first_inquiry_len; pass = 1; next_pass: SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev, "scsi scan: INQUIRY pass %d length %d\n", pass, try_inquiry_len)); /* Each pass gets up to three chances to ignore Unit Attention */ for (count = 0; count < 3; ++count) { int resid; memset(scsi_cmd, 0, 6); scsi_cmd[0] = INQUIRY; //命令類型是INQUIRY scsi_cmd[4] = (unsigned char) try_inquiry_len; memset(inq_result, 0, try_inquiry_len); //發(fā)送SCSI命令,重試3次 result = scsi_execute_req(sdev, scsi_cmd, DMA_FROM_DEVICE, inq_result, try_inquiry_len, &sshdr, HZ / 2 + HZ * scsi_inq_timeout, 3, &resid); SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev, "scsi scan: INQUIRY %s with code 0x%x\n", result ? "failed" : "successful", result)); if (result) { /* * not-ready to ready transition [asc/ascq=0x28/0x0] * or power-on, reset [asc/ascq=0x29/0x0], continue. * INQUIRY should not yield UNIT_ATTENTION * but many buggy devices do so anyway. */ if ((driver_byte(result) & DRIVER_SENSE) && scsi_sense_valid(&sshdr)) { if ((sshdr.sense_key == UNIT_ATTENTION) && ((sshdr.asc == 0x28) || (sshdr.asc == 0x29)) && (sshdr.ascq == 0)) continue; } } else { /* * if nothing was transferred, we try * again. It's a workaround for some USB * devices. 
*/ if (resid == try_inquiry_len) continue; } break; } if (result == 0) { sanitize_inquiry_string(&inq_result[8], 8); sanitize_inquiry_string(&inq_result[16], 16); sanitize_inquiry_string(&inq_result[32], 4); response_len = inq_result[4] + 5; if (response_len > 255) response_len = first_inquiry_len; /* sanity */ /* * Get any flags for this device. * * XXX add a bflags to scsi_device, and replace the * corresponding bit fields in scsi_device, so bflags * need not be passed as an argument. */ *bflags = scsi_get_device_flags(sdev, &inq_result[8], &inq_result[16]); /* When the first pass succeeds we gain information about * what larger transfer lengths might work. */ if (pass == 1) { if (BLIST_INQUIRY_36 & *bflags) next_inquiry_len = 36; else if (BLIST_INQUIRY_58 & *bflags) next_inquiry_len = 58; else if (sdev->inquiry_len) next_inquiry_len = sdev->inquiry_len; else next_inquiry_len = response_len; /* If more data is available perform the second pass */ if (next_inquiry_len > try_inquiry_len) { try_inquiry_len = next_inquiry_len; pass = 2; goto next_pass; } } } else if (pass == 2) { sdev_printk(KERN_INFO, sdev, "scsi scan: %d byte inquiry failed. " "Consider BLIST_INQUIRY_36 for this device\n", try_inquiry_len); /* If this pass failed, the third pass goes back and transfers * the same amount as we successfully got in the first pass. */ try_inquiry_len = first_inquiry_len; pass = 3; goto next_pass; } /* If the last transfer attempt got an error, assume the * peripheral doesn't exist or is dead. */ if (result) return -EIO; /* Don't report any more data than the device says is valid */ sdev->inquiry_len = min(try_inquiry_len, response_len); /* * XXX Abort if the response length is less than 36? If less than * 32, the lookup of the device flags (above) could be invalid, * and it would be possible to take an incorrect action - we do * not want to hang because of a short INQUIRY. 
On the flip side, * if the device is spun down or becoming ready (and so it gives a * short INQUIRY), an abort here prevents any further use of the * device, including spin up. * * On the whole, the best approach seems to be to assume the first * 36 bytes are valid no matter what the device says. That's * better than copying < 36 bytes to the inquiry-result buffer * and displaying garbage for the Vendor, Product, or Revision * strings. */ if (sdev->inquiry_len < 36) { if (!sdev->host->short_inquiry) { shost_printk(KERN_INFO, sdev->host, "scsi scan: INQUIRY result too short (%d)," " using 36\n", sdev->inquiry_len); sdev->host->short_inquiry = 1; } sdev->inquiry_len = 36; } /* * Related to the above issue: * * XXX Devices (disk or all?) should be sent a TEST UNIT READY, * and if not ready, sent a START_STOP to start (maybe spin up) and * then send the INQUIRY again, since the INQUIRY can change after * a device is initialized. * * Ideally, start a device if explicitly asked to do so. This * assumes that a device is spun up on power on, spun down on * request, and then spun up on request. */ /* * The scanning code needs to know the scsi_level, even if no * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so * non-zero LUNs can be scanned. */ sdev->scsi_level = inq_result[2] & 0x07; if (sdev->scsi_level >= 2 || (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1)) sdev->scsi_level++; sdev->sdev_target->scsi_level = sdev->scsi_level; /* * If SCSI-2 or lower, and if the transport requires it, * store the LUN value in CDB[1]. */ sdev->lun_in_cdb = 0; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN && !sdev->host->no_scsi2_lun_in_cdb) sdev->lun_in_cdb = 1; return 0;}static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, int *bflags, int async){ int ret; /* * XXX do not save the inquiry, since it can change underneath us, * save just vendor/model/rev. 
* * Rather than save it and have an ioctl that retrieves the saved * value, have an ioctl that executes the same INQUIRY code used * in scsi_probe_lun, let user level programs doing INQUIRY * scanning run at their own risk, or supply a user level program * that can correctly scan. */ /* * Copy at least 36 bytes of INQUIRY data, so that we don't * dereference unallocated memory when accessing the Vendor, * Product, and Revision strings. Badly behaved devices may set * the INQUIRY Additional Length byte to a small value, indicating * these strings are invalid, but often they contain plausible data * nonetheless. It doesn't matter if the device sent < 36 bytes * total, since scsi_probe_lun() initializes inq_result with 0s. */ sdev->inquiry = kmemdup(inq_result, max_t(size_t, sdev->inquiry_len, 36), GFP_ATOMIC); if (sdev->inquiry == NULL) return SCSI_SCAN_NO_RESPONSE; sdev->vendor = (char *) (sdev->inquiry + 8); //第8個(gè)字節(jié)到第15個(gè)字節(jié)是vendor identification sdev->model = (char *) (sdev->inquiry + 16); //第16個(gè)字節(jié)到第31個(gè)字節(jié)是product identification sdev->rev = (char *) (sdev->inquiry + 32); //第32個(gè)字節(jié)到第35個(gè)字節(jié)是product revision level if (strncmp(sdev->vendor, "ATA ", 8) == 0) { /* * sata emulation layer device. This is a hack to work around * the SATL power management specifications which state that * when the SATL detects the device has gone into standby * mode, it shall respond with NOT READY. */ sdev->allow_restart = 1; } if (*bflags & BLIST_ISROM) { sdev->type = TYPE_ROM; sdev->removable = 1; } else { sdev->type = (inq_result[0] & 0x1f); sdev->removable = (inq_result[1] & 0x80) >> 7; /* * some devices may respond with wrong type for * well-known logical units. Force well-known type * to enumerate them correctly. 
*/ if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) { sdev_printk(KERN_WARNING, sdev, "%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n", __func__, sdev->type, (unsigned int)sdev->lun); sdev->type = TYPE_WLUN; } } if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) { /* RBC and MMC devices can return SCSI-3 compliance and yet * still not support REPORT LUNS, so make them act as * BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is * specifically set */ if ((*bflags & BLIST_REPORTLUN2) == 0) *bflags |= BLIST_NOREPORTLUN; } /* * For a peripheral qualifier (PQ) value of 1 (001b), the SCSI * spec says: The device server is capable of supporting the * specified peripheral device type on this logical unit. However, * the physical device is not currently connected to this logical * unit. * * The above is vague, as it implies that we could treat 001 and * 011 the same. Stay compatible with previous code, and create a * scsi_device for a PQ of 1 * * Don't set the device offline here; rather let the upper * level drivers eval the PQ to decide whether they should * attach. So remove ((inq_result[0] >> 5) & 7) == 1 check. */ sdev->inq_periph_qual = (inq_result[0] >> 5) & 7; sdev->lockable = sdev->removable; sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2); if (sdev->scsi_level >= SCSI_3 || (sdev->inquiry_len > 56 && inq_result[56] & 0x04)) sdev->ppr = 1; if (inq_result[7] & 0x60) sdev->wdtr = 1; if (inq_result[7] & 0x10) sdev->sdtr = 1; sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d " "ANSI: %d%s\n", scsi_device_type(sdev->type), sdev->vendor, sdev->model, sdev->rev, sdev->inq_periph_qual, inq_result[2] & 0x07, (inq_result[3] & 0x0f) == 1 ? " CCS" : ""); if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) && !(*bflags & BLIST_NOTQ)) { sdev->tagged_supported = 1; sdev->simple_tags = 1; } /* * Some devices (Texel CD ROM drives) have handshaking problems * when used with the Seagate controllers. 
borken is initialized * to 1, and then set it to 0 here. */ if ((*bflags & BLIST_BORKEN) == 0) sdev->borken = 0; if (*bflags & BLIST_NO_ULD_ATTACH) sdev->no_uld_attach = 1; /* * Apparently some really broken devices (contrary to the SCSI * standards) need to be selected without asserting ATN */ if (*bflags & BLIST_SELECT_NO_ATN) sdev->select_no_atn = 1; /* * Maximum 512 sector transfer length * broken RA4x00 Compaq Disk Array */ if (*bflags & BLIST_MAX_512) blk_queue_max_hw_sectors(sdev->request_queue, 512); /* * Max 1024 sector transfer length for targets that report incorrect * max/optimal lengths and relied on the old block layer safe default */ else if (*bflags & BLIST_MAX_1024) blk_queue_max_hw_sectors(sdev->request_queue, 1024); /* * Some devices may not want to have a start command automatically * issued when a device is added. */ if (*bflags & BLIST_NOSTARTONADD) sdev->no_start_on_add = 1; if (*bflags & BLIST_SINGLELUN) scsi_target(sdev)->single_lun = 1; sdev->use_10_for_rw = 1; if (*bflags & BLIST_MS_SKIP_PAGE_08) sdev->skip_ms_page_8 = 1; if (*bflags & BLIST_MS_SKIP_PAGE_3F) sdev->skip_ms_page_3f = 1; if (*bflags & BLIST_USE_10_BYTE_MS) sdev->use_10_for_ms = 1; /* some devices don't like REPORT SUPPORTED OPERATION CODES * and will simply timeout causing sd_mod init to take a very * very long time */ if (*bflags & BLIST_NO_RSOC) sdev->no_report_opcodes = 1; /* set the device running here so that slave configure * may do I/O */ ret = scsi_device_set_state(sdev, SDEV_RUNNING); //狀態(tài) if (ret) { ret = scsi_device_set_state(sdev, SDEV_BLOCK); if (ret) { sdev_printk(KERN_ERR, sdev, "in wrong state %s to complete scan\n", scsi_device_state_name(sdev->sdev_state)); return SCSI_SCAN_NO_RESPONSE; } } if (*bflags & BLIST_MS_192_BYTES_FOR_3F) sdev->use_192_bytes_for_3f = 1; if (*bflags & BLIST_NOT_LOCKABLE) sdev->lockable = 0; if (*bflags & BLIST_RETRY_HWERROR) sdev->retry_hwerror = 1; if (*bflags & BLIST_NO_DIF) sdev->no_dif = 1; sdev->eh_timeout = 
SCSI_DEFAULT_EH_TIMEOUT; if (*bflags & BLIST_TRY_VPD_PAGES) sdev->try_vpd_pages = 1; else if (*bflags & BLIST_SKIP_VPD_PAGES) sdev->skip_vpd_pages = 1; transport_configure_device(&sdev->sdev_gendev); //把lun配置到scsi傳輸層 if (sdev->host->hostt->slave_configure) { ret = sdev->host->hostt->slave_configure(sdev); //主機(jī)適配器模板設(shè)置的回調(diào),對scsi_device(lun)執(zhí)行特定的初始化 if (ret) { /* * if LLDD reports slave not present, don't clutter * console with alloc failure messages */ if (ret != -ENXIO) { sdev_printk(KERN_ERR, sdev, "failed to configure device\n"); } return SCSI_SCAN_NO_RESPONSE; } } if (sdev->scsi_level >= SCSI_3) scsi_attach_vpd(sdev); sdev->max_queue_depth = sdev->queue_depth; //設(shè)置最大隊(duì)列深度 /* * Ok, the device is now all set up, we can * register it and tell the rest of the kernel * about it. */ //添加scsi_device(lun)到sysfs if (!async && scsi_sysfs_add_sdev(sdev) != 0) return SCSI_SCAN_NO_RESPONSE; return SCSI_SCAN_LUN_PRESENT;}