国产一级a片免费看高清,亚洲熟女中文字幕在线视频,黄三级高清在线播放,免费黄色视频在线看

打開APP
userphoto
未登錄

開通VIP,暢享免費(fèi)電子書等14項(xiàng)超值服

開通VIP
Linux存儲IO棧
本系列文章將自底向上分析Linux存儲IO棧源碼(基於4.4.19),為學習Linux存儲做記錄。具體目錄如下:
一、 Linux內核對象與對象集

二、 sysfs

三、 設備模型

四、 SCSI子系統

五、 SCSI磁盤驅動sd

六、 SCSI Target--TCM

七、 用戶空間IO--UIO

八、 在用戶空間實現虛擬SCSI磁盤--TCMU

九、 通用塊層

十、 文件系統--VFS


Linux內核對象和對象集

內核對象作為Linux設備驅動模型的基礎,主要是抽象和封裝總線、設備、驅動、類和接口之間的關係及其具體實現的相關代碼,並在sysfs中呈現。主要抽象成kobject和kset結構:

struct kobject {    const char      *name;   //在sysfs中顯示的名稱    struct list_head    entry;   //鏈入kset的kobj鏈表    struct kobject      *parent; //指向父kobject,用于表示樹形結(jié)構(gòu)    struct kset     *kset;   //指向鏈入的kset    struct kobj_type    *ktype;  //抽象kobject的通用方法和屬性    struct kernfs_node  *sd;     //sysfs directory entry     struct kref     kref;    //引用計(jì)數(shù) #ifdef CONFIG_DEBUG_KOBJECT_RELEASE    struct delayed_work release; #endif    unsigned int state_initialized:1;  //是否被初始化    unsigned int state_in_sysfs:1;     //是否被添加到sysfs    unsigned int state_add_uevent_sent:1; //是否發(fā)送ADD事件到用戶空間    unsigned int state_remove_uevent_sent:1; //是否發(fā)送REMOVE事件到用戶空間    unsigned int uevent_suppress:1; //事件是否被抑制};

在kobject結構中,ktype域是對kobject一些通用方法和屬性的封裝:

/*
 * struct kobj_type - methods and attributes common to the kobjects that
 * point at it through kobject->ktype.
 */
struct kobj_type {
    void (*release)(struct kobject *kobj);  /* called back when the kobject is released */
    const struct sysfs_ops *sysfs_ops;      /* sysfs operations */
    struct attribute **default_attrs;       /* default attributes */

    /* namespace-related operations */
    const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
    const void *(*namespace)(struct kobject *kobj);
};

kset是一組kobject的集合,通過kset可以遍歷這組kobject。例如在SCSI子系統中,設備是一種kobject,通過設備集kset,可以遍歷所有的設備。

/** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. * * A kset defines a group of kobjects.  They can be individually * different "types" but overall these kobjects all want to be grouped * together and operated on in the same manner.  ksets are used to * define the attribute callbacks and other common events that happen to * a kobject. * * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) * @uevent_ops: the set of uevent operations for this kset.  These are * called whenever a kobject has something happen to it so that the kset * can add new environment variables, or filter out the uevents if so * desired. */struct kset {    struct list_head list; //鏈入kset的kobject鏈表    spinlock_t list_lock;  //遍歷鏈表是的自旋鎖struct kobject kobj;   //本身可以當(dāng)做kobject對待    const struct kset_uevent_ops *uevent_ops; //發(fā)送uevent事件的回調(diào)函數(shù)};

在發送事件到用戶空間時,可以回調kset_uevent_ops中的3個回調函數:

/*
 * struct kset_uevent_ops - per-kset callbacks consulted while a uevent is
 * being built.
 *
 * @filter: called before the event is sent; a zero return drops the event.
 * @name:   returns the subsystem name to report for the event.
 * @uevent: adds kset-specific variables to the event environment.
 */
struct kset_uevent_ops {
    int (* const filter)(struct kset *kset, struct kobject *kobj);
    const char *(* const name)(struct kset *kset, struct kobject *kobj);
    int (* const uevent)(struct kset *kset, struct kobject *kobj,
                         struct kobj_uevent_env *env);
};
  • filter:在發送事件之前過濾掉某些事件(返回0表示丟棄該事件)。

  • name:獲取事件所屬子系統的名稱。

  • uevent:設置uevent需要的環境變量。

內核對象相關操作

/* Initialization and registration. */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...);
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
                         struct kobject *parent, const char *fmt, ...);
void kobject_del(struct kobject *kobj);

/* Dynamic allocation. */
struct kobject *kobject_create(void);
struct kobject *kobject_create_and_add(const char *name,
                                       struct kobject *parent);

/* Renaming and moving. */
int kobject_rename(struct kobject *, const char *new_name);
int kobject_move(struct kobject *, struct kobject *);

/* Reference counting. */
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);

/* Queries. */
const void *kobject_namespace(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);

內核對象創建及初始化

初始化流程主要在kobject_init:

/** * kobject_init - initialize a kobject structure * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * * This function will properly initialize a kobject such that it can then * be passed to the kobject_add() call. * * After this function is called, the kobject MUST be cleaned up by a call * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */void kobject_init(struct kobject *kobj, struct kobj_type *ktype){    char *err_str;    if (!kobj) {        err_str = "invalid kobject pointer!";        goto error;    }    if (!ktype) {        err_str = "must have a ktype to be initialized properly!\n";        goto error;    }    if (kobj->state_initialized) {  //避免重復(fù)初始化        /* do not error out as sometimes we can recover */        printk(KERN_ERR "kobject (%p): tried to init an initialized "               "object, something is seriously wrong.\n", kobj);        dump_stack();    }    kobject_init_internal(kobj); //完成初始化的主要函數(shù)    kobj->ktype = ktype;    return;error:    printk(KERN_ERR "kobject (%p): %s\n", kobj, err_str);    dump_stack();}EXPORT_SYMBOL(kobject_init);

由上面函數可以看出由kobject_init_internal完成初始化:

/*
 * kobject_init_internal - put a kobject into its freshly-initialized state.
 * @kobj: kobject to set up; a NULL pointer is silently ignored.
 *
 * Initializes the embedded kref and list entry, clears the sysfs/uevent
 * state bits and finally marks the object as initialized.
 */
static void kobject_init_internal(struct kobject *kobj)
{
    if (kobj) {
        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        /* not in sysfs yet, and no uevent has been sent so far */
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;

        kobj->state_initialized = 1;
    }
}

kobject_create函數僅僅是在調用kobject_init之前,先分配kobject空間。在kobject初始化之後,需要調用kobject_add將kobject添加到sysfs中。

/** * kobject_add - the main kobject add function * @kobj: the kobject to add * @parent: pointer to the parent of the kobject. * @fmt: format to name the kobject with. * * The kobject name is set and added to the kobject hierarchy in this * function. * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the * kobject associated with the kset assigned to this kobject.  If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. * * If this function returns an error, kobject_put() must be called to * properly clean up the memory associated with the object. * Under no instance should the kobject that is passed to this function * be directly freed with a call to kfree(), that can leak memory. * * Note, no "add" uevent will be created with this call, the caller should set * up all of the necessary sysfs files for the object and then call * kobject_uevent() with the UEVENT_ADD parameter to ensure that * userspace is properly notified of this kobject's creation. */int kobject_add(struct kobject *kobj, struct kobject *parent,        const char *fmt, ...){    va_list args;    int retval;    if (!kobj)        return -EINVAL;    if (!kobj->state_initialized) { //add之前需要初始化        printk(KERN_ERR "kobject '%s' (%p): tried to add an "               "uninitialized object, something is seriously wrong.\n",               kobject_name(kobj), kobj);        dump_stack();        return -EINVAL;    }    va_start(args, fmt);    retval = kobject_add_varg(kobj, parent, fmt, args); //主要完成add操作    va_end(args);    return retval;}EXPORT_SYMBOL(kobject_add);

kobject_add_varg/kobject_add_internal主要完成將kobject添加到sysfs的操作:

static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,                       struct kobject *parent,                       const char *fmt, va_list vargs){    int retval;        //設(shè)置kobject在sysfs中顯示的名稱    retval = kobject_set_name_vargs(kobj, fmt, vargs);    if (retval) {        printk(KERN_ERR "kobject: can not set name properly!\n");        return retval;    }    kobj->parent = parent;    return kobject_add_internal(kobj); //主要實(shí)現(xiàn)函數(shù)}static int kobject_add_internal(struct kobject *kobj){    int error = 0;    struct kobject *parent;    if (!kobj)        return -ENOENT;    if (!kobj->name || !kobj->name[0]) {        WARN(1, "kobject: (%p): attempted to be registered with empty "             "name!\n", kobj);        return -EINVAL;    }    parent = kobject_get(kobj->parent); //增加父對象的引用計(jì)數(shù)    /* join kset if set, use it as parent if we do not already have one */    if (kobj->kset) { //如果設(shè)置了kset,而沒有設(shè)置parent,則把kset的kobject設(shè)置為parent        if (!parent)            parent = kobject_get(&kobj->kset->kobj);        kobj_kset_join(kobj);        kobj->parent = parent;    }    pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",         kobject_name(kobj), kobj, __func__,         parent ? kobject_name(parent) : "<NULL>",         kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");    error = create_dir(kobj);  //創(chuàng)建sysfs對應(yīng)的目錄和屬性文件    if (error) {  //出錯(cuò)回滾        kobj_kset_leave(kobj);        kobject_put(parent);        kobj->parent = NULL;        /* be noisy on error issues */        if (error == -EEXIST)            WARN(1, "%s failed for %s with "                 "-EEXIST, don't try to register things with "                 "the same name in the same directory.\n",                 __func__, kobject_name(kobj));        else            WARN(1, "%s failed for %s (error: %d parent: %s)\n",                 __func__, kobject_name(kobj), error,                 parent ? 
kobject_name(parent) : "'none'");    } else        kobj->state_in_sysfs = 1; //更新標(biāo)志位    return error;}

由create_dir在sysfs創建真實的目錄和文件,這點由下一篇sysfs詳細描述。理解了kobject_init和kobject_add之後,由名字就可以知道kobject_init_and_add和kobject_create_and_add這兩個函數的作用。

內核對象釋放

調用kobject_del將kobject從sysfs及其所屬的kset中移除;kobject本身要等到引用計數經kobject_put降為0時才會被釋放:

/** * kobject_del - unlink kobject from hierarchy. * @kobj: object. */void kobject_del(struct kobject *kobj){    struct kernfs_node *sd;    if (!kobj)        return;    sd = kobj->sd;    sysfs_remove_dir(kobj); //刪除kobject在sysfs中的目錄    sysfs_put(sd);    kobj->state_in_sysfs = 0; //設(shè)置標(biāo)志位    kobj_kset_leave(kobj);  //kobject脫離kset鏈表    kobject_put(kobj->parent); //調(diào)用kobject_release釋放    kobj->parent = NULL;}EXPORT_SYMBOL(kobject_del);/** * kobject_put - decrement refcount for object. * @kobj: object. * * Decrement the refcount, and if 0, call kobject_cleanup(). */void kobject_put(struct kobject *kobj){    if (kobj) {        if (!kobj->state_initialized)            WARN(1, KERN_WARNING "kobject: '%s' (%p): is not "                   "initialized, yet kobject_put() is being "                   "called.\n", kobject_name(kobj), kobj);        kref_put(&kobj->kref, kobject_release);  //調(diào)用kobject_release    }}EXPORT_SYMBOL(kobject_put);static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)){    return kref_sub(kref, 1, release);}static inline int kref_sub(struct kref *kref, unsigned int count,         void (*release)(struct kref *kref)){    WARN_ON(release == NULL);    if (atomic_sub_and_test((int) count, &kref->refcount)) {        release(kref); //調(diào)用kobject_release        return 1;    }    return 0;}

根據上面的代碼追蹤,得知kobject_release才是釋放kobject的主角:

/*
 * kobject_release - kref release callback for a kobject.
 * @kref: the kref embedded in the kobject whose last reference was dropped
 *
 * Without CONFIG_DEBUG_KOBJECT_RELEASE the kobject is cleaned up at once.
 * With it, the cleanup is queued as delayed work after HZ plus 0..3 * HZ
 * ticks (a random multiple).
 */
static void kobject_release(struct kref *kref)
{
    struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
    unsigned long delay = HZ + HZ * (get_random_int() & 0x3);

    pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
            kobject_name(kobj), kobj, __func__, kobj->parent, delay);
    INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);

    /* run kobject_delayed_cleanup() later to do the cleanup */
    schedule_delayed_work(&kobj->release, delay);
#else
    kobject_cleanup(kobj); /* clean up immediately */
#endif
}

如果在內核編譯時指定CONFIG_DEBUG_KOBJECT_RELEASE,則使用延遲release方式調用kobject_delayed_cleanup,否則直接調用kobject_cleanup。

#ifdef CONFIG_DEBUG_KOBJECT_RELEASEstatic void kobject_delayed_cleanup(struct work_struct *work){    kobject_cleanup(container_of(to_delayed_work(work), //最終還是調(diào)用                     struct kobject, release));}#endif/* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup */static void kobject_cleanup(struct kobject *kobj){    struct kobj_type *t = get_ktype(kobj);    const char *name = kobj->name;    pr_debug("kobject: '%s' (%p): %s, parent %p\n",         kobject_name(kobj), kobj, __func__, kobj->parent);    if (t && !t->release)        pr_debug("kobject: '%s' (%p): does not have a release() "             "function, it is broken and must be fixed.\n",             kobject_name(kobj), kobj);    /* send "remove" if the caller did not do it but sent "add" */    if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {        pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",             kobject_name(kobj), kobj);        kobject_uevent(kobj, KOBJ_REMOVE); //僅僅發(fā)送一次REMOVE消息    }    /* remove from sysfs if the caller did not do it */    if (kobj->state_in_sysfs) {        pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",             kobject_name(kobj), kobj);        kobject_del(kobj); //如果調(diào)用者沒有清理sysfs,則清理    }    if (t && t->release) {        pr_debug("kobject: '%s' (%p): calling ktype release\n",             kobject_name(kobj), kobj);        t->release(kobj); //調(diào)用kobj_type的release回調(diào)函數(shù)    }    /* free name if we allocated it */    if (name) {        pr_debug("kobject: '%s': free name\n", name);        kfree_const(name);    }}

內核對象集相關操作

/* Initialization and registration of a caller-provided kset. */
void kset_init(struct kset *kset);
int kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);

/*
 * Dynamic creation.  kset_create() is defined "static" in this file, so its
 * declaration must be static as well.
 */
static struct kset *kset_create(const char *name,
                                const struct kset_uevent_ops *uevent_ops,
                                struct kobject *parent_kobj);
struct kset *kset_create_and_add(const char *name,
                                 const struct kset_uevent_ops *u,
                                 struct kobject *parent_kobj);

內核對象集創建及初始化

內核對象集由kset_create創建:

/** * kset_create - create a struct kset dynamically * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically.  This structure can * then be registered with the system and show up in sysfs with a call to * kset_register().  When you are finished with this structure, if * kset_register() has been called, call kset_unregister() and the * structure will be dynamically freed when it is no longer being used. * * If the kset was not able to be created, NULL will be returned. */static struct kset *kset_create(const char *name,                const struct kset_uevent_ops *uevent_ops,                struct kobject *parent_kobj){    struct kset *kset;    int retval;    kset = kzalloc(sizeof(*kset), GFP_KERNEL);  //分配空間    if (!kset)        return NULL;    retval = kobject_set_name(&kset->kobj, "%s", name); //設(shè)置kset在sysfs中的名字    if (retval) {        kfree(kset);        return NULL;    }    kset->uevent_ops = uevent_ops;   //設(shè)置uevent_ops    kset->kobj.parent = parent_kobj; //設(shè)置kset的父對象    /*     * The kobject of this kset will have a type of kset_ktype and belong to     * no kset itself.  That way we can properly free it when it is     * finished being used.     */    kset->kobj.ktype = &kset_ktype;  //設(shè)置kobj_type    kset->kobj.kset = NULL;    return kset;}

內核對象集由kset_init執行初始化:

/**
 * kset_init - initialize a kset for use
 * @k: kset
 */
void kset_init(struct kset *k)
{
    /*
     * The embedded kobject is initialized by kobject_init_internal(), which
     * is already defined earlier in this file (kref, list entry and state
     * bits); defining that static helper a second time here would be a
     * redefinition, so it is not repeated.
     */
    kobject_init_internal(&k->kobj);
    INIT_LIST_HEAD(&k->list);
    spin_lock_init(&k->list_lock);
}

調用kset_register將kset添加到sysfs(kset_register內部會先調用kset_init完成初始化):

/** * kset_register - initialize and add a kset. * @k: kset. */int kset_register(struct kset *k){    int err;    if (!k)        return -EINVAL;    kset_init(k);    err = kobject_add_internal(&k->kobj); //完成register動作,前面已說明    if (err)        return err;    kobject_uevent(&k->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間    return 0;}EXPORT_SYMBOL(kset_register);

經過kset_create、kset_init和kset_register之後,kset已初始化並添加完成。當然kset_create_and_add包含了這三個函數。

內核對象集釋放

內核對象集的釋放過程與kobject的釋放過程類似,由kset_unregister完成:

/**
 * kset_unregister - remove a kset.
 * @k: kset.
 */
void kset_unregister(struct kset *k)
{
    if (!k)
        return;
    kobject_del(&k->kobj); /* remove the sysfs directory and attribute files */
    kobject_put(&k->kobj); /* same release path as for a plain kobject */
}
EXPORT_SYMBOL(kset_unregister);

發送事件到用戶空間

由前面的代碼可以看到,無論kobject或是kset,都會向用戶空間發送事件,由kobject_uevent函數通過設置環境變量的方式完成:

struct kobj_uevent_env {    char *argv[3];                //user_helper使用的命令    char *envp[UEVENT_NUM_ENVP];  //環(huán)境變量數(shù)組    int envp_idx;                 //當(dāng)前環(huán)境變量索引    char buf[UEVENT_BUFFER_SIZE]; //環(huán)境變量數(shù)據(jù)緩沖區(qū)    int buflen;};/** * kobject_uevent - notify userspace by sending an uevent * * @action: action that is happening * @kobj: struct kobject that the action is happening to * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. */int kobject_uevent(struct kobject *kobj, enum kobject_action action){    return kobject_uevent_env(kobj, action, NULL); //實(shí)際完成發(fā)送函數(shù)}EXPORT_SYMBOL_GPL(kobject_uevent);/** * kobject_uevent_env - send an uevent with environmental data * * @action: action that is happening * @kobj: struct kobject that the action is happening to * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. 
*/int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,               char *envp_ext[]){    struct kobj_uevent_env *env;    const char *action_string = kobject_actions[action];    const char *devpath = NULL;    const char *subsystem;    struct kobject *top_kobj;    struct kset *kset;    const struct kset_uevent_ops *uevent_ops;    int i = 0;    int retval = 0;#ifdef CONFIG_NET    struct uevent_sock *ue_sk;#endif    pr_debug("kobject: '%s' (%p): %s\n",         kobject_name(kobj), kobj, __func__);    /* search the kset we belong to */    top_kobj = kobj;    while (!top_kobj->kset && top_kobj->parent)  //尋找最近的kset,kset中有鍀event_ops        top_kobj = top_kobj->parent;    if (!top_kobj->kset) {        pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "             "without kset!\n", kobject_name(kobj), kobj,             __func__);        return -EINVAL;    }    kset = top_kobj->kset;    uevent_ops = kset->uevent_ops;  //使用kset中的uevent_ops執(zhí)行發(fā)送操作    /* skip the event, if uevent_suppress is set*/    if (kobj->uevent_suppress) {  //跳過設(shè)置為uevent_suppress的kobject        pr_debug("kobject: '%s' (%p): %s: uevent_suppress "                 "caused the event to drop!\n",                 kobject_name(kobj), kobj, __func__);        return 0;    }    /* skip the event, if the filter returns zero. 
*/    if (uevent_ops && uevent_ops->filter)  //調(diào)用uevent_ops的filter函數(shù)        if (!uevent_ops->filter(kset, kobj)) {            pr_debug("kobject: '%s' (%p): %s: filter function "                 "caused the event to drop!\n",                 kobject_name(kobj), kobj, __func__);            return 0;        }    /* originating subsystem */    if (uevent_ops && uevent_ops->name)  //確定發(fā)送事件的kobject名字        subsystem = uevent_ops->name(kset, kobj);    else        subsystem = kobject_name(&kset->kobj);    if (!subsystem) {        pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "             "event to drop!\n", kobject_name(kobj), kobj,             __func__);        return 0;    }    /* environment buffer */    env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); //分配kobj_uevent_env    if (!env)        return -ENOMEM;    /* complete object path */    devpath = kobject_get_path(kobj, GFP_KERNEL);    if (!devpath) {        retval = -ENOENT;        goto exit;    }    /* default keys 添加環(huán)境變量 */    retval = add_uevent_var(env, "ACTION=%s", action_string);    if (retval)        goto exit;    retval = add_uevent_var(env, "DEVPATH=%s", devpath);    if (retval)        goto exit;    retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);    if (retval)        goto exit;    /* keys passed in from the caller */    if (envp_ext) {        for (i = 0; envp_ext[i]; i++) {            retval = add_uevent_var(env, "%s", envp_ext[i]);            if (retval)                goto exit;        }    }    /* let the kset specific function add its stuff */    if (uevent_ops && uevent_ops->uevent) { //調(diào)用uevent回調(diào)函數(shù),添加子系統(tǒng)特定的環(huán)境變量        retval = uevent_ops->uevent(kset, kobj, env);        if (retval) {            pr_debug("kobject: '%s' (%p): %s: uevent() returned "                 "%d\n", kobject_name(kobj), kobj,                 __func__, retval);            goto exit;        }    }    /*     * Mark "add" and "remove" events in the 
object to ensure proper     * events to userspace during automatic cleanup. If the object did     * send an "add" event, "remove" will automatically generated by     * the core, if not already done by the caller.     */    if (action == KOBJ_ADD)        kobj->state_add_uevent_sent = 1;    else if (action == KOBJ_REMOVE)        kobj->state_remove_uevent_sent = 1;    mutex_lock(&uevent_sock_mutex);    /* we will send an event, so request a new sequence number */    retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);    if (retval) {        mutex_unlock(&uevent_sock_mutex);        goto exit;    }#if defined(CONFIG_NET)  //如果在編譯時(shí)指定CONFIG_NET,使用netlink發(fā)送    /* send netlink message */    list_for_each_entry(ue_sk, &uevent_sock_list, list) {        struct sock *uevent_sock = ue_sk->sk;        struct sk_buff *skb;        size_t len;        if (!netlink_has_listeners(uevent_sock, 1))            continue;        /* allocate message with the maximum possible size */        len = strlen(action_string) + strlen(devpath) + 2;        skb = alloc_skb(len + env->buflen, GFP_KERNEL);        if (skb) {            char *scratch;            /* add header */            scratch = skb_put(skb, len);            sprintf(scratch, "%s@%s", action_string, devpath);            /* copy keys to our continuous event payload buffer */            for (i = 0; i < env->envp_idx; i++) {                len = strlen(env->envp[i]) + 1;                scratch = skb_put(skb, len);                strcpy(scratch, env->envp[i]);            }            NETLINK_CB(skb).dst_group = 1;            retval = netlink_broadcast_filtered(uevent_sock, skb, //使用netlink多播發(fā)送                                0, 1, GFP_KERNEL,                                kobj_bcast_filter,                                kobj);            /* ENOBUFS should be handled in userspace */            if (retval == -ENOBUFS || retval == -ESRCH)                retval = 0;        } else            retval = 
-ENOMEM;    }#endif    mutex_unlock(&uevent_sock_mutex);#ifdef CONFIG_UEVENT_HELPER  //不能使用netlink時(shí),使用user_helper發(fā)送    /* call uevent_helper, usually only enabled during early boot */    if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {        struct subprocess_info *info;        retval = add_uevent_var(env, "HOME=/");        if (retval)            goto exit;        retval = add_uevent_var(env,                    "PATH=/sbin:/bin:/usr/sbin:/usr/bin");        if (retval)            goto exit;        retval = init_uevent_argv(env, subsystem); //組裝需要調(diào)用的用戶空間命令和參數(shù)        if (retval)            goto exit;        retval = -ENOMEM;        info = call_usermodehelper_setup(env->argv[0], env->argv,  //調(diào)用用戶空間程序/sbin/hotplug                         env->envp, GFP_KERNEL,                         NULL, cleanup_uevent_env, env);        if (info) {            retval = call_usermodehelper_exec(info, UMH_NO_WAIT);            env = NULL; /* freed by cleanup_uevent_env */        }    }#endifexit:    kfree(devpath);    kfree(env);    return retval;}EXPORT_SYMBOL_GPL(kobject_uevent_env);

sysfs與內核對象

本篇文章不是以文件系統的角度來詳細描述sysfs,而是從內核對象如何通過sysfs表示整個設備驅動模型為切入點,進一步理解Linux內核對象。

內核對象添加到sysfs

在上文《內核對象與對象集》中,將kobject添加到sysfs的調用鏈為kobject_add -> kobject_add_varg -> kobject_add_internal,最後調用create_dir創建sysfs目錄和屬性文件。

static int create_dir(struct kobject *kobj){    const struct kobj_ns_type_operations *ops;    int error;        //調(diào)用sysfs接口創(chuàng)建kobject對應(yīng)的目錄    error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));    if (error)        return error;    error = populate_dir(kobj);  //在kobject對應(yīng)的目錄中生成默認(rèn)屬性文件    if (error) {        sysfs_remove_dir(kobj);        return error;    }    /*     * @kobj->sd may be deleted by an ancestor going away.  Hold an     * extra reference so that it stays until @kobj is gone.     */    sysfs_get(kobj->sd);    /*     * If @kobj has ns_ops, its children need to be filtered based on     * their namespace tags.  Enable namespace support on @kobj->sd.     */    ops = kobj_child_ns_ops(kobj);    if (ops) {        BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);        BUG_ON(ops->type >= KOBJ_NS_TYPES);        BUG_ON(!kobj_ns_type_registered(ops->type));        sysfs_enable_ns(kobj->sd);    }    return 0;}/* * populate_dir - populate directory with attributes. * @kobj: object we're working on. * * Most subsystems have a set of default attributes that are associated * with an object that registers with them.  This is a helper called during * object registration that loops through the default attributes of the * subsystem and creates attributes files for them in sysfs. */static int populate_dir(struct kobject *kobj){    struct kobj_type *t = get_ktype(kobj);    struct attribute *attr;    int error = 0;    int i;    if (t && t->default_attrs) {        for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) {            error = sysfs_create_file(kobj, attr); //為每個(gè)屬性創(chuàng)建對應(yīng)的文件            if (error)                break;        }    }    return error;}

create_dir通過調用sysfs_create_dir_ns創建sysfs中的目錄,調用sysfs_create_file創建屬性文件。

sysfs的核心結構

kernfs_node代表sysfs中的每個節點。

/* * kernfs_node - the building block of kernfs hierarchy.  Each and every * kernfs node is represented by single kernfs_node.  Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as s_count reference is held, the kernfs_node itself is * accessible.  Dereferencing elem or any other outer entity requires * active reference. */struct kernfs_node {    atomic_t        count;   //引用計(jì)數(shù)    atomic_t        active;  //活動的引用計(jì)數(shù)#ifdef CONFIG_DEBUG_LOCK_ALLOC    struct lockdep_map  dep_map;#endif    /*     * Use kernfs_get_parent() and kernfs_name/path() instead of     * accessing the following two fields directly.  If the node is     * never moved to a different parent, it is safe to access the     * parent directly.     */    struct kernfs_node  *parent; //指向父節(jié)點(diǎn)    const char      *name;       //節(jié)點(diǎn)名稱,在sysfs顯示的名字    struct rb_node      rb;      //接入sysfs紅黑樹的鏈接項(xiàng)    const void      *ns;    /* namespace tag */    unsigned int        hash;   /* ns + name hash 紅黑樹key */    union {        struct kernfs_elem_dir      dir;     //該kern_node類型為目錄        struct kernfs_elem_symlink  symlink; //該kern_node類型為鏈接        struct kernfs_elem_attr     attr;    //該kern_node類型為屬性文件    };    void            *priv;    unsigned short      flags; //標(biāo)記位,目錄、鏈接、屬性文件或是否已被刪除    umode_t         mode;      //訪問權(quán)限,在sysfs中該kern_node的權(quán)限    unsigned int        ino;   //唯一編號    struct kernfs_iattrs    *iattr;  //用于設(shè)置非默認(rèn)的inode屬性,如果沒有則置為NULL};

在sysfs中創(chuàng)建目錄sysfs_create_dir_ns

/** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */int sysfs_create_dir_ns(struct kobject *kobj, const void *ns){    struct kernfs_node *parent, *kn;    BUG_ON(!kobj);    if (kobj->parent)        parent = kobj->parent->sd; //如果kobject設(shè)置parent,則使用之    else        parent = sysfs_root_kn;  //否則parent就設(shè)置為sysfs根目錄    if (!parent)        return -ENOENT;    //創(chuàng)建目錄    kn = kernfs_create_dir_ns(parent, kobject_name(kobj),                  S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);    if (IS_ERR(kn)) {        if (PTR_ERR(kn) == -EEXIST)            sysfs_warn_dup(parent, kobject_name(kobj));        return PTR_ERR(kn);    }    kobj->sd = kn;    return 0;}/** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Returns the created node on success, ERR_PTR() value on failure. */struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,                     const char *name, umode_t mode,                     void *priv, const void *ns){    struct kernfs_node *kn;    int rc;    /* allocate 分配空間并初始化, KERNFS_DIR指定創(chuàng)建目錄 */    kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);    if (!kn)        return ERR_PTR(-ENOMEM);    kn->dir.root = parent->dir.root; //指向根目錄kern_node    kn->ns = ns;  //指定命名空間    kn->priv = priv;    /* link in */    rc = kernfs_add_one(kn); //將kern_node加入父目錄的紅黑樹中    if (!rc)        return kn;    kernfs_put(kn);    return ERR_PTR(rc);}

kernfs_create_dir_ns函數中的兩個主要函數kernfs_new_node和kernfs_add_one,在創建文件和創建符號鏈接時同樣使用,僅是參數不同。

為kernfs_node結構分配空間,並初始化

/*
 * __kernfs_new_node - allocate and initialise a kernfs_node
 * @root: kernfs root whose ino_ida supplies the inode number
 * @name: node name; duplicated with kstrdup_const()
 * @mode: mode bits stored in the node
 * @flags: KERNFS_* flags stored in the node
 *
 * Returns the new node with a reference count of 1, or NULL when any of
 * the allocations fails.  The parent is not set here.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                         const char *name, umode_t mode,
                         unsigned flags)
{
    struct kernfs_node *kn;
    int ret;

    name = kstrdup_const(name, GFP_KERNEL); /* duplicate the (possibly constant) string */
    if (!name)
        return NULL;

    kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); /* zeroed node from the slab cache */
    if (!kn)
        goto err_out1;

    /*
     * If the ino of the sysfs entry created for a kmem cache gets
     * allocated from an ida layer, which is accounted to the memcg that
     * owns the cache, the memcg will get pinned forever. So do not account
     * ino ida allocations.
     */
    ret = ida_simple_get(&root->ino_ida, 1, 0,  /* unique number identifying the node */
                 GFP_KERNEL | __GFP_NOACCOUNT);
    if (ret < 0)
        goto err_out2;
    kn->ino = ret;

    atomic_set(&kn->count, 1);  /* initial reference */
    atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
    RB_CLEAR_NODE(&kn->rb);

    /* fill in the remaining fields */
    kn->name = name;
    kn->mode = mode;
    kn->flags = flags;

    return kn;

 err_out2:
    kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
    kfree_const(name);
    return NULL;
}

/*
 * kernfs_new_node - allocate a node and attach it to @parent
 *
 * Allocates and initialises the node via __kernfs_new_node(); on success a
 * reference on @parent is taken and recorded in kn->parent.  Returns NULL
 * on allocation failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                    const char *name, umode_t mode,
                    unsigned flags)
{
    struct kernfs_node *kn;

    /* allocate and initialise the kernfs_node */
    kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
    if (kn) {
        kernfs_get(parent);
        kn->parent = parent;
    }
    return kn;
}

將kernfs_node添加到parent的紅黑樹中:

/** *  kernfs_add_one - add kernfs_node to parent without warning *  @kn: kernfs_node to be added * *  The caller must already have initialized @kn->parent.  This *  function increments nlink of the parent's inode if @kn is a *  directory and link into the children list of the parent. * *  RETURNS: *  0 on success, -EEXIST if entry with the given name already *  exists. */int kernfs_add_one(struct kernfs_node *kn){    struct kernfs_node *parent = kn->parent;    struct kernfs_iattrs *ps_iattr;    bool has_ns;    int ret;    mutex_lock(&kernfs_mutex);    ret = -EINVAL;    has_ns = kernfs_ns_enabled(parent);    if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",         has_ns ? "required" : "invalid", parent->name, kn->name))        goto out_unlock;    if (kernfs_type(parent) != KERNFS_DIR) //檢查parent是否為目錄        goto out_unlock;    ret = -ENOENT;    if (parent->flags & KERNFS_EMPTY_DIR)  //檢查parent是否為空目錄        goto out_unlock;    //檢查parent是否是active狀態(tài)    if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))        goto out_unlock;    kn->hash = kernfs_name_hash(kn->name, kn->ns); //作為紅黑樹比較的key    ret = kernfs_link_sibling(kn); //kern_node鏈入parent節(jié)點(diǎn)紅黑樹中    if (ret)        goto out_unlock;    /* Update timestamps on the parent */    ps_iattr = parent->iattr;    if (ps_iattr) {        struct iattr *ps_iattrs = &ps_iattr->ia_iattr;        ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;    }    mutex_unlock(&kernfs_mutex);    /*     * Activate the new node unless CREATE_DEACTIVATED is requested.     * If not activated here, the kernfs user is responsible for     * activating the node with kernfs_activate().  A node which hasn't     * been activated is not visible to userland and its removal won't     * trigger deactivation.     */    if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))        kernfs_activate(kn);    return 0;out_unlock:    mutex_unlock(&kernfs_mutex);    return ret;}

sysfs紅黑樹中的key:

/** *  kernfs_name_hash *  @name: Null terminated string to hash *  @ns:   Namespace tag to hash * *  Returns 31 bit hash of ns + name (so it fits in an off_t ) */static unsigned int kernfs_name_hash(const char *name, const void *ns){    unsigned long hash = init_name_hash();    unsigned int len = strlen(name);    while (len--)        hash = partial_name_hash(*name++, hash);    hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));    hash &= 0x7fffffffU;    /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */    if (hash < 2)        hash += 2;    if (hash >= INT_MAX)        hash = INT_MAX - 1;    return hash;}static int kernfs_name_compare(unsigned int hash, const char *name,                   const void *ns, const struct kernfs_node *kn){    if (hash < kn->hash)        return -1;    if (hash > kn->hash)        return 1;    if (ns < kn->ns)        return -1;    if (ns > kn->ns)        return 1;    return strcmp(name, kn->name);}
  • kernfs_name_hash: 根據name和ns計算kernfs_node的hash值,保存在kernfs_node.hash域中。

  • kernfs_name_compare: sysfs紅黑樹key的比較函數, 比較優先級是: hash > ns > name

kernfs_node鏈入parent節點的紅黑樹中:

/** *  kernfs_link_sibling - link kernfs_node into sibling rbtree *  @kn: kernfs_node of interest * *  Link @kn into its sibling rbtree which starts from *  @kn->parent->dir.children. * *  Locking: *  mutex_lock(kernfs_mutex) * *  RETURNS: *  0 on susccess -EEXIST on failure. */static int kernfs_link_sibling(struct kernfs_node *kn){    struct rb_node **node = &kn->parent->dir.children.rb_node; //parent目錄的紅黑樹    struct rb_node *parent = NULL;    while (*node) {  //在parent的目錄中,尋找合適的位置將kn插入parent的紅黑樹中        struct kernfs_node *pos;        int result;        pos = rb_to_kn(*node);        parent = *node;        result = kernfs_sd_compare(kn, pos); //優(yōu)先順序: hash > ns > name        if (result < 0)            node = &pos->rb.rb_left;        else if (result > 0)            node = &pos->rb.rb_right;        else            return -EEXIST;    }    /* add new node and rebalance the tree */    rb_link_node(&kn->rb, parent, node);    rb_insert_color(&kn->rb, &kn->parent->dir.children);    /* successfully added, account subdir number */    if (kernfs_type(kn) == KERNFS_DIR)        kn->parent->dir.subdirs++;    return 0;}

在sysfs中創建文件

static inline int __must_check sysfs_create_file(struct kobject *kobj,                         const struct attribute *attr){    return sysfs_create_file_ns(kobj, attr, NULL);}/** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,             const void *ns){    BUG_ON(!kobj || !kobj->sd || !attr);    return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);}EXPORT_SYMBOL_GPL(sysfs_create_file_ns);int sysfs_add_file_mode_ns(struct kernfs_node *parent,               const struct attribute *attr, bool is_bin,               umode_t mode, const void *ns){    struct lock_class_key *key = NULL;    const struct kernfs_ops *ops;    struct kernfs_node *kn;    loff_t size;    if (!is_bin) {        struct kobject *kobj = parent->priv;        const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;        /* every kobject with an attribute needs a ktype assigned */        if (WARN(!sysfs_ops, KERN_ERR             "missing sysfs attribute operations for kobject: %s\n",             kobject_name(kobj)))            return -EINVAL;        //確定讀寫的操作函數(shù)        if (sysfs_ops->show && sysfs_ops->store) {            if (mode & SYSFS_PREALLOC)                ops = &sysfs_prealloc_kfops_rw;            else                ops = &sysfs_file_kfops_rw;        } else if (sysfs_ops->show) {            if (mode & SYSFS_PREALLOC)                ops = &sysfs_prealloc_kfops_ro;            else                ops = &sysfs_file_kfops_ro;        } else if (sysfs_ops->store) {            if (mode & SYSFS_PREALLOC)                ops = &sysfs_prealloc_kfops_wo;            else                ops = &sysfs_file_kfops_wo;        } else            ops = &sysfs_file_kfops_empty;        size = PAGE_SIZE;    } else {        struct bin_attribute *battr = (void *)attr;        if 
(battr->mmap)            ops = &sysfs_bin_kfops_mmap;        else if (battr->read && battr->write)            ops = &sysfs_bin_kfops_rw;        else if (battr->read)            ops = &sysfs_bin_kfops_ro;        else if (battr->write)            ops = &sysfs_bin_kfops_wo;        else            ops = &sysfs_file_kfops_empty;        size = battr->size;    }#ifdef CONFIG_DEBUG_LOCK_ALLOC    if (!attr->ignore_lockdep)        key = attr->key ?: (struct lock_class_key *)&attr->skey;#endif    kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,                  (void *)attr, ns, key); //創(chuàng)建屬性文件    if (IS_ERR(kn)) {        if (PTR_ERR(kn) == -EEXIST)            sysfs_warn_dup(parent, attr->name);        return PTR_ERR(kn);    }    return 0;}

通過上面的代碼跟蹤,創建屬性文件由__kernfs_create_file實現,最終仍然是調用kernfs_new_node和kernfs_add_one。

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                     const char *name,
                     umode_t mode, loff_t size,
                     const struct kernfs_ops *ops,
                     void *priv, const void *ns,
                     struct lock_class_key *key)
{
    struct kernfs_node *kn;
    unsigned flags;
    int rc;

    flags = KERNFS_FILE; /* the new node is a file */

    /* allocate and initialise */
    kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
    if (!kn)
        return ERR_PTR(-ENOMEM);

    kn->attr.ops = ops;
    kn->attr.size = size;
    kn->ns = ns;
    kn->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    if (key) {
        lockdep_init_map(&kn->dep_map, "s_active", key, 0);
        kn->flags |= KERNFS_LOCKDEP;
    }
#endif

    /*
     * kn->attr.ops is accessible only while holding active ref.  We
     * need to know whether some ops are implemented outside active
     * ref.  Cache their existence in flags.
     */
    if (ops->seq_show)
        kn->flags |= KERNFS_HAS_SEQ_SHOW;
    if (ops->mmap)
        kn->flags |= KERNFS_HAS_MMAP;

    rc = kernfs_add_one(kn); /* link the node into the parent's red-black tree */
    if (rc) {
        kernfs_put(kn);
        return ERR_PTR(rc);
    }
    return kn;
}

在sysfs_add_file_mode_ns函數中,根據sysfs_ops是否提供show/store以及mode中是否設置了SYSFS_PREALLOC,選擇不同的讀寫操作表。下面以sysfs_prealloc_kfops_rw操作表為例,其他結構類似,不贅述。

//常規(guī)文件--sysfs_prealloc_kfops_rwstatic const struct kernfs_ops sysfs_prealloc_kfops_rw = {    .read       = sysfs_kf_read,    .write      = sysfs_kf_write,    .prealloc   = true,};/* kernfs read callback for regular sysfs files with pre-alloc */static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,                 size_t count, loff_t pos){    const struct sysfs_ops *ops = sysfs_file_ops(of->kn); //獲取kobject中的sysfs_ops操作表    struct kobject *kobj = of->kn->parent->priv;    size_t len;    /*     * If buf != of->prealloc_buf, we don't know how     * large it is, so cannot safely pass it to ->show     */    if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))        return 0;    len = ops->show(kobj, of->kn->priv, buf); //kobject中sd域的sysfs_ops操作表中的show    return min(count, len);}/* kernfs write callback for regular sysfs files */static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,                  size_t count, loff_t pos){   //獲取kobject中的sysfs_ops操作表    const struct sysfs_ops *ops = sysfs_file_ops(of->kn);    struct kobject *kobj = of->kn->parent->priv;    if (!count)        return 0;    return ops->store(kobj, of->kn->priv, buf, count); //kobject中sd域的sysfs_ops操作表中的store}

關于屬性文件的讀寫操作,最終都回調到kobject的ktype域中的sysfs_ops操作表,這個操作表是在kobject_init函數中隨ktype一起設置的。回顧kobject_create函數:

struct kobject *kobject_create(void){    struct kobject *kobj;    kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); //分配空間    if (!kobj)        return NULL;    kobject_init(kobj, &dynamic_kobj_ktype);  //初始化, kobj_type類型為dynamic_kobj_ktype    return kobj;}//注冊如下結(jié)構(gòu)static struct kobj_type dynamic_kobj_ktype = {    .release    = dynamic_kobj_release,    .sysfs_ops  = &kobj_sysfs_ops,};const struct sysfs_ops kobj_sysfs_ops = {    .show   = kobj_attr_show,    .store  = kobj_attr_store,};EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

kobject的sysfs的show和store方法為:kobj_attr_show和kobj_attr_store

static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,                  char *buf){    struct kobj_attribute *kattr;    ssize_t ret = -EIO;    kattr = container_of(attr, struct kobj_attribute, attr);    if (kattr->show)  //如果業(yè)務(wù)子系統(tǒng)設(shè)置了show函數(shù),則調(diào)用        ret = kattr->show(kobj, kattr, buf);    return ret;}static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,                   const char *buf, size_t count){    struct kobj_attribute *kattr;    ssize_t ret = -EIO;    kattr = container_of(attr, struct kobj_attribute, attr);    if (kattr->store)  //如果業(yè)務(wù)子系統(tǒng)設(shè)置了store函數(shù),則調(diào)用        ret = kattr->store(kobj, kattr, buf, count);    return ret;}

真正對屬性文件進行讀寫的回調由業務子系統實現。

在sysfs中創建符號鏈接

/** *  sysfs_create_link - create symlink between two objects. *  @kobj:  object whose directory we're creating the link in. *  @target:    object we're pointing to. *  @name:      name of the symlink. */int sysfs_create_link(struct kobject *kobj, struct kobject *target,              const char *name){    return sysfs_do_create_link(kobj, target, name, 1);}EXPORT_SYMBOL_GPL(sysfs_create_link);static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,                const char *name, int warn){    struct kernfs_node *parent = NULL;    if (!kobj)        parent = sysfs_root_kn;    else        parent = kobj->sd;    if (!parent)        return -EFAULT;    return sysfs_do_create_link_sd(parent, target, name, warn);}static int sysfs_do_create_link_sd(struct kernfs_node *parent,                   struct kobject *target_kobj,                   const char *name, int warn){    struct kernfs_node *kn, *target = NULL;    BUG_ON(!name || !parent);    /*     * We don't own @target_kobj and it may be removed at any time.     * Synchronize using sysfs_symlink_target_lock.  See     * sysfs_remove_dir() for details.     */    spin_lock(&sysfs_symlink_target_lock);    if (target_kobj->sd) {        target = target_kobj->sd;        kernfs_get(target);    }    spin_unlock(&sysfs_symlink_target_lock);    if (!target)        return -ENOENT;    kn = kernfs_create_link(parent, name, target); //創(chuàng)建sysfs符號鏈接    kernfs_put(target);    if (!IS_ERR(kn))        return 0;    if (warn && PTR_ERR(kn) == -EEXIST)        sysfs_warn_dup(parent, name);    return PTR_ERR(kn);}

由上面的代碼追蹤,創建符號鏈接最終由kernfs_create_link函數完成。

/** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Returns the created node on success, ERR_PTR() value on error. */struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,                       const char *name,                       struct kernfs_node *target){    struct kernfs_node *kn;    int error;    //指定創(chuàng)建符號鏈接    kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);    if (!kn)        return ERR_PTR(-ENOMEM);    if (kernfs_ns_enabled(parent))        kn->ns = target->ns;    kn->symlink.target_kn = target;    kernfs_get(target); /* ref owned by symlink */    error = kernfs_add_one(kn); //將kern_node添加到parent的紅黑樹中    if (!error)        return kn;    kernfs_put(kn);    return ERR_PTR(error);}

與創建目錄和文件類似,最終仍然是調用kernfs_new_node和kernfs_add_one實現。

基于內核對象的編程套路

目標:在sysfs中創建一個目錄/sys/kernel/storage/,并在該目錄下創建一個文件value。value可以寫入整型數據,隨后可以讀出。

  • 定義內核對象

/* Kernel object of the example: a kobject plus the integer it stores. */
struct storage_obj {
    struct kobject kobj;
    int val;  /* holds the value written through sysfs */
};
  • 定義屬性類型

struct storage_attribute {    struct attribute *attr;    ssize_t (*show)(struct kobject *, struct attribute *, char *);    ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);}
  • 聲明屬性 
    定義屬性的show和store方法,如下:

//定義并初始化storage_attributestruct storage_attribute *sattr = &struct storage_attribute {    .attr = {.name = "value", .mode = 0666},    .show = storage_show,    .store = storage_store,};
  • 實現sysfs操作

ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf) {    struct storage *stor = container_of(kobj, struct storage_obj, kobj);    stor->val = atoi(buf);}ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s) {    struct storage *stor = container_of(kobj, struct storage_obj, kobj);    memcpy(buf, s, itoa(stor->val));}
  • 定義內核對象release方法
    release方法設置在kobj_type結構中

void storage_release(struct kobject *kobj){    ......}
  • 聲明內核對象類型

struct storage_ktype {    struct kobj_type *ktype;}
  • 封裝對象屬性添加和刪除方法
    需要將value屬性添加到內核對象,或者從內核對象刪除,可以直接調用sysfs_create_file和sysfs_remove_file。但大多數情況下,會對這兩個方法做一層封裝:storage_create_file和storage_remove_file。

/*
 * storage_create_file - add @attr as a sysfs file of @sobj
 *
 * Returns the result of sysfs_create_file(), or 0 without doing anything
 * when @sobj is NULL.
 */
int storage_create_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return 0;

    return sysfs_create_file(&sobj->kobj, &attr->attr);
}

/*
 * storage_remove_file - remove the sysfs file of @attr from @sobj
 *
 * Does nothing when @sobj is NULL.
 */
void storage_remove_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return;

    sysfs_remove_file(&sobj->kobj, &attr->attr);
}
  • 定義對象的創建和銷毀方法

struct storage_obj * create_storage_obj() {    struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj);    struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype);    sobj->parent = kernel_kobj;    kobject_init_and_add(&sobj->kobj, &stype->ktype);    return sobj}void destroy_storage_obj(struct kobject *kobj) {    struct storage_obj *sobj = container_of(kobj, struct storage_obj, kobj);    kobject_del(kboj);    free(sobj);    free(stype);}
  • 實現模塊加載和卸載方法
    加載時調用create_storage_obj, 卸載時調用destroy_storage_obj


設備驅動模型

概述

Linux的設備驅動模型能夠帶來以下的優點:
* 使用統一機制來表達設備與驅動之間的關系,規范設備驅動的編寫,核心代碼復用。
* 將系統中的設備以樹結構組織,并且通過sysfs將其呈現在用戶空間——包括所有的總線和內部連接。
* 支持設備的熱插拔機制。
* 支持通用的電源管理機制,通過由葉子節點到根節點的方向遍歷設備樹,確保子設備在父設備之前斷電。

內核基于內核對象和sysfs,通過抽象以下五種概念,實現了設備驅動模型的框架,使得編寫子系統成為“八股文”。
1. bus_type: 總線類型,每個子系統有且只有一個總線類型,由bus_type和subsys_private兩個結構共同描述。
2. device: 設備,描述掛在總線類型中的設備,由device和device_private兩個結構共同描述。
3. driver: 驅動, 描述掛在總線類型中的驅動模塊,由device_driver和driver_private兩個結構共同描述。
4. class: 類,每個總線類型有且只有一個類,由class和subsys_private兩個結構共同描述。
5. class_interface: 接口,每個類有多個接口,由class_interface結構描述。

在Linux內核中,子系統是由bus_type, device, driver, class和class_interface之間的關系所描述,而設備驅動模型正是這些關系的核心實現,使得在編寫子系統程序時,只要遵循設備模型的套路,便不需要關注于這些復雜的關系,只需實現自身的業務邏輯。

每個子系統都有一個總線類型,總線類型擁有一個設備鏈表和一個驅動鏈表,用于連接由該總線類型已發現的設備和已加載的驅動,設備發現和驅動加載的順序是任意的。每個設備最多綁定到一個驅動,被綁定了驅動的設備可以正常工作。除此之外,每個設備可以唯一屬于某個類,類中包含多個接口,接口的方法作用于設備,不管是先添加接口,還是先發現設備。

總線類型

總線類型的數據結構

struct bus_type {    const char      *name;         //子系統(tǒng)名稱    const char      *dev_name;     //供子系統(tǒng)生成設(shè)備名稱使用    struct device       *dev_root;    struct device_attribute *dev_attrs; /* use dev_groups instead */    const struct attribute_group **bus_groups;  //總線類型使用的屬性組    const struct attribute_group **dev_groups;  //設(shè)備使用的屬性組    const struct attribute_group **drv_groups;  //驅(qū)動使用的屬性組    int (*match)(struct device *dev, struct device_driver *drv);    //檢測設(shè)備與驅(qū)動是否可以綁定    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前,設(shè)置bus特有的環(huán)境變量    int (*probe)(struct device *dev);     //當(dāng)設(shè)備可以綁定到驅(qū)動時(shí),對設(shè)備進(jìn)行初始化和執(zhí)行綁定    int (*remove)(struct device *dev);    //當(dāng)設(shè)備從驅(qū)動中解綁時(shí),回調(diào)    void (*shutdown)(struct device *dev); //當(dāng)設(shè)備斷電時(shí),回調(diào)    int (*online)(struct device *dev);    //當(dāng)設(shè)備上電時(shí),回調(diào)    int (*offline)(struct device *dev);   //當(dāng)設(shè)備下電時(shí),回調(diào)    int (*suspend)(struct device *dev, pm_message_t state); //當(dāng)設(shè)備進(jìn)入節(jié)能狀態(tài)時(shí),回調(diào)    int (*resume)(struct device *dev);                      //當(dāng)設(shè)備恢復(fù)正常狀態(tài)時(shí),回調(diào)    const struct dev_pm_ops *pm;  //電源管理相關(guān)    const struct iommu_ops *iommu_ops;    struct subsys_private *p;         //子系統(tǒng)私有類型    struct lock_class_key lock_key;};struct subsys_private {    struct kset subsys;          //總線kset,scsi子系統(tǒng)對應(yīng)/sys/bus/scsi    struct kset *devices_kset;   //設(shè)備kset, scsi子系統(tǒng)對應(yīng)/sys/bus/scsi/devices    struct list_head interfaces; //總線的接口鏈表    struct mutex mutex;              struct kset *drivers_kset;   //驅(qū)動kset, scsi子系統(tǒng)對應(yīng)/sys/bus/scsi/drivers    struct klist klist_devices;  //總線的設(shè)備鏈表    struct klist klist_drivers;  //總線的驅(qū)動鏈表    struct blocking_notifier_head bus_notifier; //子系統(tǒng)變化時(shí),需要通知的鏈表    unsigned int drivers_autoprobe:1;  
//是否允許設(shè)備或驅(qū)動加載時(shí),自動探測    struct bus_type *bus;        //指向總線類型    struct kset glue_dirs;    struct class *class;         //指向總線類型的類};

從上面的兩個結構可以看到,bus_type包含的主要是實現子系統時應該具體關注的內容,比如name和一組回調函數。而subsys_private結構主要是設備驅動模型中的關系的表達,如字段subsys的類型是kset,描述該子系統在sysfs中的表達;klist_devices和klist_drivers分別是設備鏈表和驅動鏈表,用于管理總線類型的所有設備和驅動。之后仍然會遇到xxx_private的結構,以這種方式命名的結構,都是給設備驅動模型核心使用的,業務子系統無需也不能使用。

總線類型注冊/反注冊

實現子系統的第一步就是創建bus_type,并將其注冊到系統,此時需要調用bus_register:

/** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */int bus_register(struct bus_type *bus){    int retval;    struct subsys_private *priv;    struct lock_class_key *key = &bus->lock_key;    //分配總線類型私有數(shù)據(jù)空間    priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);    if (!priv)        return -ENOMEM;    priv->bus = bus; //關(guān)聯(lián)bus_type和subsys_private    bus->p = priv;    BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);    //設(shè)置總線類型名稱到kobject中,在sysfs中顯示    retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);    if (retval)        goto out;    priv->subsys.kobj.kset = bus_kset;    priv->subsys.kobj.ktype = &bus_ktype;    priv->drivers_autoprobe = 1;    //開啟自動探測    retval = kset_register(&priv->subsys);  //將總線類型添加到設(shè)備模型中    if (retval)        goto out;    retval = bus_create_file(bus, &bus_attr_uevent); //創(chuàng)建uevent屬性文件    if (retval)        goto bus_uevent_fail;    priv->devices_kset = kset_create_and_add("devices", NULL,  //創(chuàng)建devices目錄                         &priv->subsys.kobj);    if (!priv->devices_kset) {        retval = -ENOMEM;        goto bus_devices_fail;    }    priv->drivers_kset = kset_create_and_add("drivers", NULL,  //創(chuàng)建drivers目錄                         &priv->subsys.kobj);    if (!priv->drivers_kset) {        retval = -ENOMEM;        goto bus_drivers_fail;    }    //初始化鏈表和鎖    INIT_LIST_HEAD(&priv->interfaces);    __mutex_init(&priv->mutex, "subsys mutex", key);    klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);    klist_init(&priv->klist_drivers, NULL, NULL);    retval = add_probe_files(bus); //在sysfs中添加探測文件drivers_autoprobe和drivers_probe    if (retval)        goto bus_probe_files_fail;    retval = bus_add_groups(bus, bus->bus_groups); //添加總線類型的屬性文件    if 
(retval)        goto bus_groups_fail;    pr_debug("bus: '%s': registered\n", bus->name);    return 0;    //失敗回滾操作bus_groups_fail:    remove_probe_files(bus);bus_probe_files_fail:    kset_unregister(bus->p->drivers_kset);bus_drivers_fail:    kset_unregister(bus->p->devices_kset);bus_devices_fail:    bus_remove_file(bus, &bus_attr_uevent);bus_uevent_fail:    kset_unregister(&bus->p->subsys);out:    kfree(bus->p);    bus->p = NULL;    return retval;}EXPORT_SYMBOL_GPL(bus_register);

注冊總線類型后,便可以在系統中看到:

root@ubuntu16:~# ls /sys/bus/scsi -l
total 0
drwxr-xr-x 2 root root    0 Sep  5 16:01 devices
drwxr-xr-x 4 root root    0 Sep  2 09:44 drivers
-rw-r--r-- 1 root root 4096 Sep  5 11:29 drivers_autoprobe
--w------- 1 root root 4096 Sep  5 11:29 drivers_probe
--w------- 1 root root 4096 Sep  2 09:44 uevent
root@ubuntu16:~#

當從系統中注銷子系統時,需要調用bus_unregister,完成總線類型的反注冊:

/** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */void bus_unregister(struct bus_type *bus){    pr_debug("bus: '%s': unregistering\n", bus->name);    if (bus->dev_root)        device_unregister(bus->dev_root);     //刪除根設(shè)備    bus_remove_groups(bus, bus->bus_groups);  //刪除總線的屬性文件    remove_probe_files(bus);                  //刪除探測文件drivers_autoprobe和drivers_probe    kset_unregister(bus->p->drivers_kset);    //刪除drivers目錄    kset_unregister(bus->p->devices_kset);    //刪除devices目錄    bus_remove_file(bus, &bus_attr_uevent);   //刪除uevent文件    kset_unregister(&bus->p->subsys);         //刪除總線目錄}EXPORT_SYMBOL_GPL(bus_unregister);

設備

設備的數據結構

struct device {    struct device       *parent;  //指向父設(shè)備,eg.HBA    struct device_private   *p;   //設(shè)備私有指針    struct kobject kobj;          //內(nèi)嵌kobject    const char      *init_name; /* initial name of the device */    const struct device_type *type;  //設(shè)備類型,抽象出來的域和方法    struct mutex        mutex;  /* mutex to synchronize calls to its driver */    struct bus_type *bus;       /* type of bus device is on; devive歸屬的bus */    struct device_driver *driver;   /* which driver has allocated this device */    void        *platform_data; /* Platform specific data, device core doesn't touch it */    void        *driver_data;   /* Driver data, set and get with dev_set/get_drvdata */    struct dev_pm_info  power;    struct dev_pm_domain    *pm_domain;#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN    struct irq_domain   *msi_domain;#endif#ifdef CONFIG_PINCTRL    struct dev_pin_info *pins;#endif#ifdef CONFIG_GENERIC_MSI_IRQ    struct list_head    msi_list;#endif#ifdef CONFIG_NUMA    int     numa_node;  /* NUMA node this device is close to */#endif    u64     *dma_mask;  /* dma mask (if dma'able device) */    u64     coherent_dma_mask;/* Like dma_mask, but for                         alloc_coherent mappings as                         not all hardware supports                         64 bit addresses for consistent                         allocations such descriptors. 
*/    unsigned long   dma_pfn_offset;    struct device_dma_parameters *dma_parms;    struct list_head    dma_pools;  /* dma pools (if dma'ble) */    struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */#ifdef CONFIG_DMA_CMA    struct cma *cma_area;       /* contiguous memory area for dma allocations */#endif    /* arch specific additions */    struct dev_archdata archdata;    struct device_node  *of_node; /* associated device tree node */    struct fwnode_handle    *fwnode; /* firmware device node */    dev_t           devt;   /* dev_t, creates the sysfs "dev"; 設(shè)備號 */    u32         id; /* device instance */    spinlock_t      devres_lock;    struct list_head    devres_head; //設(shè)備資源鏈表頭    struct klist_node   knode_class; //鏈入類的設(shè)備鏈表    struct class        *class;      //指向鏈入的類    const struct attribute_group **groups;  /* optional groups 設(shè)備特有的屬性 */    void    (*release)(struct device *dev);  //設(shè)備是否回調(diào)    struct iommu_group  *iommu_group;    bool            offline_disabled:1;    bool            offline:1;};struct device_private {    struct klist klist_children;     //子設(shè)備鏈表    struct klist_node knode_parent;  //鏈入父設(shè)備的children鏈表    struct klist_node knode_driver;  //鏈入驅(qū)動的設(shè)備鏈表中    struct klist_node knode_bus;     //鏈入總線的設(shè)備鏈表    struct list_head deferred_probe; //鏈入延遲探測鏈表    struct device *device;           //指向關(guān)聯(lián)的device};struct device_type {    const char *name;  //設(shè)備類型的名稱    const struct attribute_group **groups;  //設(shè)備的公有屬性組    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前調(diào)用,用于設(shè)置事件環(huán)境變量    char *(*devnode)(struct device *dev, umode_t *mode, //在創(chuàng)建設(shè)備時(shí),提供名字線索             kuid_t *uid, kgid_t *gid);    void (*release)(struct device *dev);    //設(shè)備釋放時(shí)回調(diào)    const struct dev_pm_ops *pm;};

在設備驅動模型中,device結構有bus域,指向device所屬的總線類型;class域指向device所屬的唯一的類;driver域指向設備所綁定的驅動。與內核對象一樣,設備也被組織成層次結構,通過parent指向父設備。

device_private結構由設備驅動模型處理,維護和其他結構之間的內部關系。device_type結構定義設備公有的屬性和方法。

設備的注冊與反注冊

當設備被發現后,需要將設備注冊到系統,此時需要調用device_register函數:

/** * device_register - register a device with the system. * @dev: pointer to the device structure * * This happens in two clean steps - initialize the device * and add it to the system. The two steps can be called * separately, but this is the easiest and most common. * I.e. you should only call the two helpers separately if * have a clearly defined need to use and refcount the device * before it is added to the hierarchy. * * For more information, see the kerneldoc for device_initialize() * and device_add(). * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use put_device() to give up the * reference initialized in this function instead. */int device_register(struct device *dev){    device_initialize(dev);  //初始化device結(jié)構(gòu)    return device_add(dev);  //將設(shè)備添加到系統(tǒng)}EXPORT_SYMBOL_GPL(device_register);void device_initialize(struct device *dev){    dev->kobj.kset = devices_kset;             // /sys/devices/    kobject_init(&dev->kobj, &device_ktype);   // device的類型為device_ktype    INIT_LIST_HEAD(&dev->dma_pools);    mutex_init(&dev->mutex);    lockdep_set_novalidate_class(&dev->mutex);    spin_lock_init(&dev->devres_lock);    INIT_LIST_HEAD(&dev->devres_head);    device_pm_init(dev);    set_dev_node(dev, -1);#ifdef CONFIG_GENERIC_MSI_IRQ    INIT_LIST_HEAD(&dev->msi_list);#endif}EXPORT_SYMBOL_GPL(device_initialize);

device_register函數調用device_initialize對device結構進行初始化,調用device_add函數完成設備添加到系統。

/*
 * device_add - add an initialized device to the device hierarchy.
 * @dev: device to add
 *
 * Names the device, links its kobject under the parent, creates the sysfs
 * attributes and symlinks, adds it to its bus and class, emits KOBJ_ADD and
 * finally tries to bind a driver.
 *
 * Returns 0 on success or a negative errno; on failure every step already
 * performed is undone by the label ladder at the bottom, in reverse order.
 */
int device_add(struct device *dev)
{
    struct device *parent = NULL;
    struct kobject *kobj;
    struct class_interface *class_intf;
    int error = -EINVAL;

    /* hold a reference for the duration of this function; dropped at "done" */
    dev = get_device(dev);
    if (!dev)
        goto done;

    if (!dev->p) {  /* no device_private yet: allocate and initialize it here */
        error = device_private_init(dev);
        if (error)
            goto done;
    }

    /*
     * for statically allocated devices, which should all be converted
     * some day, we need to initialize the name. We prevent reading back
     * the name, and force the use of dev_name()
     */
    if (dev->init_name) {
        dev_set_name(dev, "%s", dev->init_name); /* set the name of the device's kobject */
        dev->init_name = NULL;
    }

    /* subsystems can specify simple device enumeration */
    /* no name yet: build one from the bus's dev_name and the device id */
    if (!dev_name(dev) && dev->bus && dev->bus->dev_name)
        dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

    if (!dev_name(dev)) {
        error = -EINVAL;
        goto name_error;
    }

    pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

    parent = get_device(dev->parent);
    kobj = get_device_parent(dev, parent);
    if (kobj)
        dev->kobj.parent = kobj;  /* set the parent of the device's kobject */

    /* use parent numa_node */
    if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
        set_dev_node(dev, dev_to_node(parent));

    /* first, register with generic layer. */
    /* we require the name to be set before, and pass NULL */
    error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); /* add the device under its parent's directory */
    if (error)
        goto Error;

    /* notify platform of device entry */
    if (platform_notify)
        platform_notify(dev);

    error = device_create_file(dev, &dev_attr_uevent); /* create the "uevent" file in the device directory */
    if (error)
        goto attrError;

    error = device_add_class_symlinks(dev); /* create the class-related symlinks */
    if (error)
        goto SymlinkError;
    error = device_add_attrs(dev); /* create files for the device's default attributes */
    if (error)
        goto AttrsError;
    error = bus_add_device(dev);  /* add the device to its bus_type */
    if (error)
        goto BusError;
    error = dpm_sysfs_add(dev);
    if (error)
        goto DPMError;
    device_pm_add(dev);

    if (MAJOR(dev->devt)) {
        /* create the "dev" attribute file, which exposes the device number */
        error = device_create_file(dev, &dev_attr_dev);
        if (error)
            goto DevAttrError;

        /*
         * create a symlink pointing at the device directory
         * (presumably under /sys/dev/block or /sys/dev/char - confirm
         * against device_create_sys_dev_entry())
         */
        error = device_create_sys_dev_entry(dev);
        if (error)
            goto SysEntryError;

        devtmpfs_create_node(dev); /* create the device node under /dev */
    }

    /* Notify clients of device addition.  This call must come
     * after dpm_sysfs_add() and before kobject_uevent().
     */
    if (dev->bus)
        blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                         BUS_NOTIFY_ADD_DEVICE, dev);

    kobject_uevent(&dev->kobj, KOBJ_ADD); /* send the ADD uevent */
    bus_probe_device(dev);  /* try to bind the device to a device_driver */
    if (parent)  /* a parent was given: link into the parent's list of children */
        klist_add_tail(&dev->p->knode_parent,
                   &parent->p->klist_children);

    if (dev->class) {  /* a class was set: link into the class's device list */
        mutex_lock(&dev->class->p->mutex);
        /* tie the class to the device */
        klist_add_tail(&dev->knode_class,
                   &dev->class->p->klist_devices);

        /* notify any interfaces that the device is here */
        /* i.e. call add_dev of every class_interface registered on the class */
        list_for_each_entry(class_intf,
                    &dev->class->p->interfaces, node)
            if (class_intf->add_dev)
                class_intf->add_dev(dev, class_intf);
        mutex_unlock(&dev->class->p->mutex);
    }
done:
    put_device(dev);
    return error;
 /* error unwinding: each label undoes the step that preceded the failing one */
 SysEntryError:
    if (MAJOR(dev->devt))
        device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
    device_pm_remove(dev);
    dpm_sysfs_remove(dev);
 DPMError:
    bus_remove_device(dev);
 BusError:
    device_remove_attrs(dev);
 AttrsError:
    device_remove_class_symlinks(dev);
 SymlinkError:
    device_remove_file(dev, &dev_attr_uevent);
 attrError:
    kobject_uevent(&dev->kobj, KOBJ_REMOVE);
    kobject_del(&dev->kobj);
 Error:
    cleanup_device_parent(dev);
    put_device(parent);
name_error:
    kfree(dev->p);
    dev->p = NULL;
    goto done;
}
EXPORT_SYMBOL_GPL(device_add);

設(shè)備添加到系統(tǒng)主要流程都在device_add函數(shù)實(shí)現(xiàn),上面代碼的注釋基本把主要函數(shù)的作用進(jìn)行了描述。值得關(guān)注的一個(gè)函數(shù)便是bus_probe_device,該函數(shù)完成將設(shè)備綁定到驅(qū)動的動作。

/*
 * bus_probe_device - probe drivers for a newly added device.
 *
 * If the bus has drivers_autoprobe set, tries to bind @dev to a driver, then
 * offers the device to every subsys_interface registered on the bus.
 */
void bus_probe_device(struct device *dev)
{
    struct bus_type *bus = dev->bus;
    struct subsys_interface *sif;

    if (!bus)
        return;

    if (bus->p->drivers_autoprobe) /* the bus allows automatic probing */
        device_initial_probe(dev); /* this is where the main work happens */

    mutex_lock(&bus->p->mutex);
    list_for_each_entry(sif, &bus->p->interfaces, node) /* hand the device to each interface */
        if (sif->add_dev)
            sif->add_dev(dev, sif);
    mutex_unlock(&bus->p->mutex);
}

/* Thin wrapper: attach @dev with asynchronous probing allowed. */
void device_initial_probe(struct device *dev)
{
    __device_attach(dev, true);
}

/*
 * __device_attach - try to attach @dev to a driver.
 * @allow_async: whether drivers requesting asynchronous probing may be
 *               scheduled asynchronously.
 *
 * Runs with the device lock held. Returns 1 when a pre-set dev->driver is
 * (or already was) bound, 0 when binding it failed; otherwise returns
 * whatever bus_for_each_drv() returned for the driver walk.
 */
static int __device_attach(struct device *dev, bool allow_async)
{
    int ret = 0;

    device_lock(dev);
    if (dev->driver) {  /* the driver to bind was specified by the caller */
        if (klist_node_attached(&dev->p->knode_driver)) { /* knode_driver already on a list: already bound */
            ret = 1;
            goto out_unlock;
        }
        ret = device_bind_driver(dev); /* bind and update the related lists */
        if (ret == 0)
            ret = 1;
        else {
            dev->driver = NULL;
            ret = 0;
        }
    } else {  /* no driver specified for this device */
        struct device_attach_data data = {
            .dev = dev,
            .check_async = allow_async,
            .want_async = false,
        };

        if (dev->parent)
            pm_runtime_get_sync(dev->parent);

        /* walk every driver on the bus and try to attach */
        ret = bus_for_each_drv(dev->bus, NULL, &data,
                    __device_attach_driver);
        if (!ret && allow_async && data.have_async) {
            /*
             * If we could not find appropriate driver
             * synchronously and we are allowed to do
             * async probes and there are drivers that
             * want to probe asynchronously, we'll
             * try them.
             */
            dev_dbg(dev, "scheduling asynchronous probe\n");
            get_device(dev);
            async_schedule(__device_attach_async_helper, dev);
        } else {
            pm_request_idle(dev);
        }

        if (dev->parent)
            pm_runtime_put(dev->parent);
    }
out_unlock:
    device_unlock(dev);
    return ret;
}

通過上面3個(gè)函數(shù)的追蹤,__device_attach函數(shù)遍歷bus所有的驅(qū)動,嘗試執(zhí)行attach,具體調(diào)用__device_attach_driver函數(shù)。

/*
 * __device_attach_driver - per-driver callback used while walking a bus's
 * drivers on behalf of one device.
 * @drv:   candidate driver
 * @_data: struct device_attach_data describing the device and async policy
 *
 * Returns -EBUSY if the device already has a driver, 0 if the driver does not
 * match or is skipped because of the async policy, otherwise the result of
 * driver_probe_device().
 */
static int __device_attach_driver(struct device_driver *drv, void *_data)
{
    struct device_attach_data *data = _data;
    struct device *dev = data->dev;
    bool async_allowed;

    /*
     * Check if device has already been claimed. This may
     * happen with driver loading, device discovery/registration,
     * and deferred probe processing happens all at once with
     * multiple threads.
     */
    if (dev->driver)
        return -EBUSY;

    if (!driver_match_device(drv, dev))  /* ask the bus's match method whether they fit */
        return 0;

    /* probing further requires the device to be registered already */
    async_allowed = driver_allows_async_probing(drv);

    if (async_allowed)
        data->have_async = true;

    /* skip this driver for now when its async preference differs from what this pass wants */
    if (data->check_async && async_allowed != data->want_async)
        return 0;

    return driver_probe_device(drv, dev);
}

/*
 * driver_probe_device - attempt to bind @dev to @drv.
 *
 * Returns -ENODEV if the device is not registered, otherwise the result of
 * really_probe(). The parent (if any) is kept runtime-resumed around the probe.
 */
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev)) /* the device must be registered */
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    if (dev->parent)
        pm_runtime_get_sync(dev->parent);

    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv); /* the actual probe */
    pm_request_idle(dev);

    if (dev->parent)
        pm_runtime_put(dev->parent);

    return ret;
}

從上面兩個(gè)函數(shù)來看,真正執(zhí)行probe的函數(shù)是really_probe。

/*
 * really_probe - run the probe callback and bind @dev to @drv.
 *
 * Returns 1 on success. Returns 0 when any intermediate step failed; in that
 * case everything done so far has been rolled back and the probe error itself
 * is deliberately swallowed so that the next driver can be tried.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;
    int local_trigger_count = atomic_read(&deferred_trigger_count);

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    dev->driver = drv; /* point the device at the driver being tried */

    /* If using pinctrl, bind pins now before probing */
    ret = pinctrl_bind_pins(dev);
    if (ret)
        goto probe_failed;

    /*
     * create the symlink to the device in the driver's sysfs directory and
     * the symlink to the driver in the device's directory
     */
    if (driver_sysfs_add(dev)) {
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }

    if (dev->pm_domain && dev->pm_domain->activate) {
        ret = dev->pm_domain->activate(dev);
        if (ret)
            goto probe_failed;
    }

    /*
     * Ensure devices are listed in devices_kset in correct order
     * It's important to move Dev to the end of devices_kset before
     * calling .probe, because it could be recursive and parent Dev
     * should always go first
     */
    devices_kset_move_last(dev);

    if (dev->bus->probe) {
        ret = dev->bus->probe(dev); /* the bus_type probe method takes precedence */
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {
        ret = drv->probe(dev);  /* otherwise fall back to the driver's own probe */
        if (ret)
            goto probe_failed;
    }

    pinctrl_init_done(dev);

    if (dev->pm_domain && dev->pm_domain->sync)
        dev->pm_domain->sync(dev);

    driver_bound(dev); /* link the device into the driver's device list */
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:  /* probe failed: roll back */
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;
    dev_set_drvdata(dev, NULL);
    if (dev->pm_domain && dev->pm_domain->dismiss)
        dev->pm_domain->dismiss(dev);

    switch (ret) {
    case -EPROBE_DEFER:
        /* Driver requested deferred probing */
        dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
        driver_deferred_probe_add(dev);
        /* Did a trigger occur while probing? Need to re-trigger if yes */
        if (local_trigger_count != atomic_read(&deferred_trigger_count))
            driver_deferred_probe_trigger();
        break;
    case -ENODEV:
    case -ENXIO:
        pr_debug("%s: probe of %s rejects match %d\n",
             drv->name, dev_name(dev), ret);
        break;
    default:
        /* driver matched but the probe failed */
        /* NOTE(review): a driver_sysfs_add() failure also lands here with ret == 0 */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}

到此,設(shè)備添加到系統(tǒng)的主要流程便基本清楚,不再往下跟蹤。

驅(qū)動

驅(qū)動數(shù)據(jù)結(jié)構(gòu)

/* Public description of a driver as registered with the driver model. */
struct device_driver {
    const char      *name;     /* driver name */
    struct bus_type     *bus;  /* bus_type this driver belongs to */

    struct module       *owner;
    const char      *mod_name;  /* used for built-in modules */

    bool suppress_bind_attrs;   /* disables bind/unbind via sysfs */
    enum probe_type probe_type;

    const struct of_device_id   *of_match_table;
    const struct acpi_device_id *acpi_match_table;

    int (*probe) (struct device *dev);  /* initializes the device before it is bound to the driver */
    int (*remove) (struct device *dev); /* called back when the device is unbound from the driver */
    void (*shutdown) (struct device *dev);
    int (*suspend) (struct device *dev, pm_message_t state);
    int (*resume) (struct device *dev);
    const struct attribute_group **groups; /* driver attributes */

    const struct dev_pm_ops *pm; /* power management */

    struct driver_private *p;  /* driver-core private data */
};

/* Driver-core private part of a driver. */
struct driver_private {
    struct kobject kobj;
    struct klist klist_devices;   /* list of devices bound to this driver */
    struct klist_node knode_bus;  /* node in the bus_type's list of drivers */
    struct module_kobject *mkobj;
    struct device_driver *driver;  /* back-pointer to the driver */
};

device_driver結構中,bus域指向驅動所屬的總線類型。driver_private結構中的knode_bus域用于鏈入總線類型的驅動鏈表,klist_devices域用于鏈接所有綁定到本驅動的設備。

驅(qū)動注冊與反注冊

驅(qū)動在加載時(shí),需要將其注冊到總線類型,調(diào)用driver_register實(shí)現(xiàn):

int driver_register(struct device_driver *drv){    int ret;    struct device_driver *other;    BUG_ON(!drv->bus->p); //確保bus已經(jīng)注冊到驅(qū)動模型中    //如果bus_type和driver都實(shí)現(xiàn)了同一個(gè)回調(diào),優(yōu)先使用bus_type的回調(diào)函數(shù),打印告警信息    if ((drv->bus->probe && drv->probe) ||        (drv->bus->remove && drv->remove) ||        (drv->bus->shutdown && drv->shutdown))        printk(KERN_WARNING "Driver '%s' needs updating - please use "            "bus_type methods\n", drv->name);    other = driver_find(drv->name, drv->bus); //根據(jù)名字查找驅(qū)動    if (other) {        printk(KERN_ERR "Error: Driver '%s' is already registered, "            "aborting...\n", drv->name);        return -EBUSY;    }    ret = bus_add_driver(drv); //將driver添加到bus    if (ret)        return ret;    ret = driver_add_groups(drv, drv->groups); //創(chuàng)建driver屬性文件    if (ret) {        bus_remove_driver(drv);        return ret;    }    kobject_uevent(&drv->p->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間    return ret;}EXPORT_SYMBOL_GPL(driver_register);

添加driver到bus_type,由bus_add_driver實(shí)現(xiàn):

int bus_add_driver(struct device_driver *drv){    struct bus_type *bus;    struct driver_private *priv;    int error = 0;    bus = bus_get(drv->bus);    if (!bus)        return -EINVAL;    pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);    priv = kzalloc(sizeof(*priv), GFP_KERNEL);  //分配driver_private結(jié)構(gòu)空間    if (!priv) {        error = -ENOMEM;        goto out_put_bus;    }    klist_init(&priv->klist_devices, NULL, NULL); //初始化driver設(shè)備鏈表    priv->driver = drv; //關(guān)聯(lián)device_driver和driver_private    drv->p = priv;    priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset    error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,  //添加driver到sysfs                     "%s", drv->name);    if (error)        goto out_unregister;    klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驅(qū)動鏈表中    if (drv->bus->p->drivers_autoprobe) {  //自動探測        if (driver_allows_async_probing(drv)) {  //允許異步執(zhí)行probe            pr_debug("bus: '%s': probing driver %s asynchronously\n",                drv->bus->name, drv->name);            async_schedule(driver_attach_async, drv); //異步probe        } else {            error = driver_attach(drv);  //同步probe            if (error)                goto out_unregister;        }    }    module_add_driver(drv->owner, drv);  //驅(qū)動實(shí)現(xiàn)的模塊    error = driver_create_file(drv, &driver_attr_uevent);  //在driver中添加uevent屬性文件    if (error) {        printk(KERN_ERR "%s: uevent attr (%s) failed\n",            __func__, drv->name);    }    error = driver_add_groups(drv, bus->drv_groups);  //添加driver的屬性文件    if (error) {        /* How the hell do we get out of this pickle? 
Give up */        printk(KERN_ERR "%s: driver_create_groups(%s) failed\n",            __func__, drv->name);    }    if (!drv->suppress_bind_attrs) {        error = add_bind_files(drv);  //在driver目錄添加的bind和unbind兩個(gè)屬性文件        if (error) {            /* Ditto */            printk(KERN_ERR "%s: add_bind_files(%s) failed\n",                __func__, drv->name);        }    }    return 0;out_unregister:    kobject_put(&priv->kobj);    kfree(drv->p);    drv->p = NULL;out_put_bus:    bus_put(bus);    return error;}

bus_add_driver函數(shù)完成驅(qū)動添加到總線類型,當(dāng)驅(qū)動添加完成后,如果總線類型設(shè)置了允許自動探測標(biāo)志drivers_autoprobe,便可以根據(jù)是否允許異步探測調(diào)用driver_attach_async或driver_attach,driver_attach_async也是調(diào)用driver_attach:

int driver_attach(struct device_driver *drv){    return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);}EXPORT_SYMBOL_GPL(driver_attach);static int __driver_attach(struct device *dev, void *data){    struct device_driver *drv = data;    /*     * Lock device and try to bind to it. We drop the error     * here and always return 0, because we need to keep trying     * to bind to devices and some drivers will return an error     * simply if it didn't support the device.     *     * driver_probe_device() will spit a warning if there     * is an error.     */    if (!driver_match_device(drv, dev)) //調(diào)用bus_type.match        return 0;    if (dev->parent)    /* Needed for USB */        device_lock(dev->parent);    device_lock(dev);    if (!dev->driver)        driver_probe_device(drv, dev); //完成probe的主要函數(shù)    device_unlock(dev);    if (dev->parent)        device_unlock(dev->parent);    return 0;}int driver_probe_device(struct device_driver *drv, struct device *dev){    int ret = 0;    if (!device_is_registered(dev)) //檢查device是否register        return -ENODEV;    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",         drv->bus->name, __func__, dev_name(dev), drv->name);    if (dev->parent)        pm_runtime_get_sync(dev->parent);    pm_runtime_barrier(dev);    ret = really_probe(dev, drv); //真正執(zhí)行探測    pm_request_idle(dev);    if (dev->parent)        pm_runtime_put(dev->parent);    return ret;}

根據(jù)上面3個(gè)函數(shù),最終仍然是調(diào)用前面描述過的really_probe函數(shù)完成最后的探測。

到這里驅(qū)動注冊完成,結(jié)合之前的設(shè)備注冊流程,無論是驅(qū)動注冊或是設(shè)備注冊,只要總線類型設(shè)置了自動探測標(biāo)志位,這兩個(gè)流程都會執(zhí)行探測。所以設(shè)備發(fā)現(xiàn)與驅(qū)動的加載順序已經(jīng)不再重要,也是通過這種雙向探測方式,Linux內(nèi)核支持設(shè)備的熱拔插機(jī)制。

驅(qū)動卸載時(shí),需要調(diào)用driver_unregister函數(shù),使driver脫離總線類型:

/*
 * driver_unregister - remove a driver from the system.
 * Warns and returns if @drv is NULL or was never registered (no private data).
 */
void driver_unregister(struct device_driver *drv)
{
    if (!drv || !drv->p) {
        WARN(1, "Unexpected driver unregister!\n");
        return;
    }
    driver_remove_groups(drv, drv->groups); /* remove the driver's attribute files */
    bus_remove_driver(drv);                 /* detach the driver from its bus type */
}
EXPORT_SYMBOL_GPL(driver_unregister);

/*
 * bus_remove_driver - take a driver off its bus: remove its sysfs files,
 * unlink it from the bus's driver list, unbind all of its devices and drop
 * the references held on the driver kobject and on the bus.
 */
void bus_remove_driver(struct device_driver *drv)
{
    if (!drv->bus)
        return;

    if (!drv->suppress_bind_attrs)
        remove_bind_files(drv);   /* remove "bind" and "unbind" from the driver directory */
    driver_remove_groups(drv, drv->bus->drv_groups); /* remove the bus-provided driver attributes */
    driver_remove_file(drv, &driver_attr_uevent);    /* remove the driver's "uevent" file */
    klist_remove(&drv->p->knode_bus); /* unlink from the bus type's driver list */
    pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
    driver_detach(drv);  /* unbind the driver from every device bound to it */
    module_remove_driver(drv);  /* the module implementing the driver */
    kobject_put(&drv->p->kobj); /* drop the reference count */
    bus_put(drv->bus);
}

類數(shù)據(jù)結(jié)構(gòu)

/* A device class: a higher-level grouping of devices. */
struct class {
    const char      *name;       /* class name */
    struct module       *owner;  /* module that implements this class */

    struct class_attribute      *class_attrs;     /* attributes common to the class */
    const struct attribute_group    **dev_groups; /* default attributes of devices belonging to the class */
    struct kobject          *dev_kobj;            /* kobject linking the class into sysfs */

    int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); /* sets class-specific environment variables before an event is sent */
    char *(*devnode)(struct device *dev, umode_t *mode); /* returns the device name when a device node is created */

    void (*class_release)(struct class *class); /* called when the class is released */
    void (*dev_release)(struct device *dev);    /* called when a device is released */

    int (*suspend)(struct device *dev, pm_message_t state); /* called when a device goes to sleep */
    int (*resume)(struct device *dev);                      /* called when a device is woken up */

    const struct kobj_ns_type_operations *ns_type;  /* sysfs namespace support */
    const void *(*namespace)(struct device *dev);   /* returns the namespace the device lives in */

    const struct dev_pm_ops *pm;  /* power management */

    struct subsys_private *p;     /* private subsystem data of the class */
};

類的私有數(shù)據(jù)類型與總線類型的私有數(shù)據(jù)類型都是subsys_private,這里將不再重復(fù)描述。

類注冊與反注冊

子系統需要使用類時,需要調用class_register函數向系統注冊類:

#define class_register(class)           ({                          static struct lock_class_key __key;     __class_register(class, &__key);    })int __class_register(struct class *cls, struct lock_class_key *key){    struct subsys_private *cp;    int error;    pr_debug("device class '%s': registering\n", cls->name);    cp = kzalloc(sizeof(*cp), GFP_KERNEL); //分配私有數(shù)據(jù)空間    if (!cp)        return -ENOMEM;    klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); //初始化該class的device鏈表    INIT_LIST_HEAD(&cp->interfaces);  //初始化接口鏈表    kset_init(&cp->glue_dirs);    __mutex_init(&cp->mutex, "subsys mutex", key);    error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); //將在/sys/class/目錄下顯示該名稱    if (error) {        kfree(cp);        return error;    }    /* set the default /sys/dev directory for devices of this class */    if (!cls->dev_kobj)        cls->dev_kobj = sysfs_dev_char_kobj;#if defined(CONFIG_BLOCK)    /* let the block class directory show up in the root of sysfs */    if (!sysfs_deprecated || cls != &block_class)        cp->subsys.kobj.kset = class_kset;#else    cp->subsys.kobj.kset = class_kset;  // 全局變量class_kset指的是 /sys/class/#endif    cp->subsys.kobj.ktype = &class_ktype;    cp->class = cls;  //class與subsys_private關(guān)聯(lián)    cls->p = cp;    error = kset_register(&cp->subsys);  //在/sys/class/目錄下創(chuàng)建該類對應(yīng)的目錄    if (error) {        kfree(cp);        return error;    }    error = add_class_attrs(class_get(cls)); //在/sys/class/xxx/目錄下創(chuàng)建類屬性文件    class_put(cls);    return error;}EXPORT_SYMBOL_GPL(__class_register);

類的注冊比較簡單,注釋已經比較詳細。當子系統需要卸載類時,需要調用class_unregister函數:

/*
 * class_unregister - undo class registration.
 * Removes the class attribute files first, then unregisters the class's kset.
 */
void class_unregister(struct class *cls)
{
    struct kset *subsys;

    pr_debug("device class '%s': unregistering\n", cls->name);

    /* attribute files in /sys/class/xxx/ go first */
    remove_class_attrs(cls);

    /* then the class directory itself */
    subsys = &cls->p->subsys;
    kset_unregister(subsys);
}

接口

接口數(shù)據(jù)結(jié)構(gòu)

/* An interface attached to a class; notified as devices come and go. */
struct class_interface {
    struct list_head    node;    /* node in the class's list of interfaces */
    struct class        *class;  /* class this interface belongs to */

    /*
     * Called to add a device to / remove a device from the interface, both
     * when the interface itself is registered or unregistered and when a
     * device is added to the interface's class.
     */
    int (*add_dev)      (struct device *, struct class_interface *);
    void (*remove_dev)  (struct device *, struct class_interface *);
};

接口注冊與反注冊

向類中注冊接口,需要調(diào)用class_interface_register函數(shù)完成:

int class_interface_register(struct class_interface *class_intf){    struct class *parent;    struct class_dev_iter iter;    struct device *dev;    if (!class_intf || !class_intf->class)  //確保class和class_interface都存在        return -ENODEV;    parent = class_get(class_intf->class); //增加引用計(jì)數(shù),并返回接口所屬的class    if (!parent)        return -EINVAL;    mutex_lock(&parent->p->mutex);    list_add_tail(&class_intf->node, &parent->p->interfaces); //將class_interface添加到class的接口鏈表    if (class_intf->add_dev) {  //如果接口設(shè)置了add_dev方法,對該class的所有device調(diào)用        class_dev_iter_init(&iter, parent, NULL, NULL);        while ((dev = class_dev_iter_next(&iter)))            class_intf->add_dev(dev, class_intf);  //接口方法作用于設(shè)備        class_dev_iter_exit(&iter);    }    mutex_unlock(&parent->p->mutex);    return 0;}

從類中刪除接口,需要調(diào)用class_interface_unregister函數(shù)完成:

void class_interface_unregister(struct class_interface *class_intf){    struct class *parent = class_intf->class;    struct class_dev_iter iter;    struct device *dev;    if (!parent)        return;    mutex_lock(&parent->p->mutex);    list_del_init(&class_intf->node); //將class_interface從class的接口鏈表中刪除    if (class_intf->remove_dev) { //如果接口設(shè)置了remove_dev方法,對該class的所有device調(diào)用        class_dev_iter_init(&iter, parent, NULL, NULL);        while ((dev = class_dev_iter_next(&iter)))            class_intf->remove_dev(dev, class_intf);  //接口方法作用于設(shè)備        class_dev_iter_exit(&iter);    }    mutex_unlock(&parent->p->mutex);    class_put(parent);}

基于設(shè)備驅(qū)動模型實(shí)現(xiàn)子系統(tǒng)

Linux設備驅動模型已經實現了各種對象之間的關系,以及它們在sysfs中的呈現方式。實現子系統時,只需要定義業務自身的總線類型、設備、驅動、類和接口,分別“繼承”bus_type、device、device_driver、class、class_interface,并根據具體業務實現各個結構規定的回調函數,最後調用上述的注冊函數添加到系統,便完成子系統的開發。


SCSI子系統(tǒng)之概述

Linux SCSI子系統(tǒng)的分層架構(gòu):

  • 低層:代表與SCSI的物理接口的實(shí)際驅(qū)動器,例如各個(gè)廠商為其特定的主機(jī)適配器(Host Bus Adapter, HBA)開發(fā)的驅(qū)動,低層驅(qū)動主要作用是發(fā)現(xiàn)連接到主機(jī)適配器的scsi設(shè)備,在內(nèi)存中構(gòu)建scsi子系統(tǒng)所需的數(shù)據(jù)結(jié)構(gòu),并提供消息傳遞接口,將scsi命令的接受與發(fā)送解釋為主機(jī)適配器的操作。

  • 高層: 代表各種scsi設(shè)備類型的驅(qū)動,如scsi磁盤驅(qū)動,scsi磁帶驅(qū)動,高層驅(qū)動認(rèn)領(lǐng)低層驅(qū)動發(fā)現(xiàn)的scsi設(shè)備,為這些設(shè)備分配名稱,將對設(shè)備的IO轉(zhuǎn)換為scsi命令,交由低層驅(qū)動處理。

  • 中層:包含scsi棧的公共服務(wù)函數(shù)。高層和低層通過調(diào)用中層的函數(shù)完成其功能,而中層在執(zhí)行過程中,也需要調(diào)用高層和低層注冊的回調(diào)函數(shù)做一些個(gè)性化處理。

Linux SCSI模型

Linux SCSI模型是內(nèi)核的抽象,主機(jī)適配器連接主機(jī)IO總線(如PCI總線)和存儲IO總線(如SCSI總線)。一臺計(jì)算機(jī)可以有多個(gè)主機(jī)適配器,而主機(jī)適配器可以控制一條或多條SCSI總線,一條總線可以有多個(gè)目標(biāo)節(jié)點(diǎn)與之相連,并且一個(gè)目標(biāo)節(jié)點(diǎn)可以有多個(gè)邏輯單元。

在Linux SCSI子系統(tǒng)中,內(nèi)核中的目標(biāo)節(jié)點(diǎn)(target)對應(yīng)SCSI磁盤,SCSI磁盤中可以有多個(gè)邏輯單元,統(tǒng)一由磁盤控制器控制,這些邏輯單元才是真正作為IO終點(diǎn)的存儲設(shè)備,內(nèi)核用設(shè)備(device)對邏輯單元進(jìn)行抽象;內(nèi)核中的Host對應(yīng)主機(jī)適配器(物理的HBA/RAID卡,虛擬的iscsi target)

內核使用四元組 <host:channel:id:lun> 來唯一標識一個scsi的邏輯單元,在sysfs中查看sda磁盤<2:0:0:0>顯示如下:

root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/
alignment_offset  device             events_poll_msecs  integrity  removable  sda5    subsystem
bdi               discard_alignment  ext_range          power      ro         size    trace
capability        events             holders            queue      sda1       slaves  uevent
dev               events_async       inflight           range      sda2       stat
root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev
8:0
root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda
brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda

  • host: 主機(jī)適配器的唯一編號。

  • channel: 主機(jī)適配器中scsi通道編號,由主機(jī)適配器固件維護(hù)。

  • id: 目標(biāo)節(jié)點(diǎn)唯一標(biāo)識符。

  • lun: 目標(biāo)節(jié)點(diǎn)內(nèi)邏輯單元編號。

SCSI命令

SCSI 命令是在 Command Descriptor Block (CDB) 中定義的。CDB 包含了用來定義要執(zhí)行的特定操作的操作代碼,以及大量特定于操作的參數(shù)。

命令 | 用途
Test unit ready | 查詢設備是否已經準備好進行傳輸
Inquiry | 請求設備基本信息
Request sense | 請求之前命令的錯誤信息
Read capacity | 請求存儲容量信息
Read | 從設備讀取數據
Write | 向設備寫入數據
Mode sense | 請求模式頁面(設備參數)
Mode select | 在模式頁面配置設備參數

借助大約 60 種可用命令,SCSI 可適用于許多設(shè)備(包括隨機(jī)存取設(shè)備,比如磁盤和像磁帶這樣的順序存儲設(shè)備)。SCSI 也提供了專門的命令以訪問箱體服務(wù)(比如存儲箱體內(nèi)部當(dāng)前的傳感和溫度)。

核心數(shù)據(jù)結(jié)構(gòu)

主機(jī)適配器模板scsi_host_template

主機適配器模板是相同型號主機適配器的公共內容,包括請求隊列深度、SCSI命令處理回調函數、錯誤處理恢復函數。分配主機適配器結構時,需要使用主機適配器模板來賦值。在編寫SCSI低層驅動時,第一步便是定義模板scsi_host_template,之後才能由模板生成主機適配器。

struct scsi_host_template {    struct module *module;  //指向使用該模板實(shí)現(xiàn)的scsi_host,低層驅(qū)動模塊。    const char *name;       //主機(jī)適配器名稱    int (* detect)(struct scsi_host_template *);    int (* release)(struct Scsi_Host *);    const char *(* info)(struct Scsi_Host *); //返回HBA相關(guān)信息,可選實(shí)現(xiàn)    int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用戶空間ioctl函數(shù)的實(shí)現(xiàn),可選實(shí)現(xiàn)#ifdef CONFIG_COMPAT    //通過該函數(shù),支持32位系統(tǒng)的用戶態(tài)ioctl函數(shù)    int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);#endif    //將scsi命令放進(jìn)低層驅(qū)動的隊(duì)列,由中間層調(diào)用,必須實(shí)現(xiàn)    int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);    //以下5個(gè)函數(shù)是錯(cuò)誤處理回調(diào)函數(shù),由中間層按照嚴(yán)重程度調(diào)用    int (* eh_abort_handler)(struct scsi_cmnd *);        //Abort    int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset    int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset    int (* eh_bus_reset_handler)(struct scsi_cmnd *);    //Bus Reset    int (* eh_host_reset_handler)(struct scsi_cmnd *);   //Host Reset    //當(dāng)掃描到新磁盤時(shí)調(diào)用,中間層回調(diào)這個(gè)函數(shù)中可以分配和初始化低層驅(qū)動所需要的結(jié)構(gòu)    int (* slave_alloc)(struct scsi_device *)//在設(shè)備受到INQUIRY命令后,執(zhí)行相關(guān)的配置操作    int (* slave_configure)(struct scsi_device *);    //在scsi設(shè)備銷毀之前調(diào)用,中間層回調(diào)用于釋放slave_alloc分配的私有數(shù)據(jù)    void (* slave_destroy)(struct scsi_device *);    //當(dāng)發(fā)現(xiàn)新的target,中間層調(diào)用,用戶分配target私有數(shù)據(jù)    int (* target_alloc)(struct scsi_target *);    //在target被銷毀之前,中間層調(diào)用,低層驅(qū)動實(shí)現(xiàn),用于釋放target_alloc分配的數(shù)據(jù)    void (* target_destroy)(struct scsi_target *);    //需要自定義掃描target邏輯時(shí),中間層循環(huán)檢查返回值,直到該函數(shù)返回1,表示掃描完成    int (* scan_finished)(struct Scsi_Host *, unsigned long);    //需要自定義掃描target邏輯時(shí),掃描開始前回調(diào)    void (* scan_start)(struct Scsi_Host *);    //改變主機(jī)適配器的隊(duì)列深度,返回設(shè)置的隊(duì)列深度    int (* 
change_queue_depth)(struct scsi_device *, int);    //返回磁盤的BIOS參數(shù),如size, device, list (heads, sectors, cylinders)    int (* bios_param)(struct scsi_device *, struct block_device *,            sector_t, int []);    void (*unlock_native_capacity)(struct scsi_device *);    //在procfs中的讀寫操作回調(diào)    int (*show_info)(struct seq_file *, struct Scsi_Host *);    int (*write_info)(struct Scsi_Host *, char *, int);    //中間層發(fā)現(xiàn)scsi命令超時(shí)回調(diào)    enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);    //通過sysfs屬性reset主機(jī)適配器時(shí),回調(diào)    int (*host_reset)(struct Scsi_Host *shost, int reset_type);#define SCSI_ADAPTER_RESET  1#define SCSI_FIRMWARE_RESET 2    const char *proc_name; //在proc文件系統(tǒng)的名稱    struct proc_dir_entry *proc_dir;    int can_queue; //主機(jī)適配器能同時(shí)接受的命令數(shù)    int this_id;    /*     * This determines the degree to which the host adapter is capable     * of scatter-gather.     */  //聚散列表的參數(shù)    unsigned short sg_tablesize;    unsigned short sg_prot_tablesize;    /*     * Set this if the host adapter has limitations beside segment count.     */ //單個(gè)scsi命令能夠訪問的扇區(qū)最大數(shù)量    unsigned int max_sectors;    /*     * DMA scatter gather segment boundary limit. A segment crossing this     * boundary will be split in two.     */    unsigned long dma_boundary; //DMA聚散段邊界值,超過該值將被切割成兩個(gè)#define SCSI_DEFAULT_MAX_SECTORS    1024    short cmd_per_lun;    /*     * present contains counter indicating how many boards of this     * type were found when we did the scan.     */    unsigned char present;    /* If use block layer to manage tags, this is tag allocation policy */    int tag_alloc_policy;    /*     * Track QUEUE_FULL events and reduce queue depth on demand.     */    unsigned track_queue_depth:1;    /*     * This specifies the mode that a LLD supports.     */    unsigned supported_mode:2; //低層驅(qū)動支持的模式(initiator或target)    /*     * True if this host adapter uses unchecked DMA onto an ISA bus.     
*/    unsigned unchecked_isa_dma:1;    unsigned use_clustering:1;    /*     * True for emulated SCSI host adapters (e.g. ATAPI).     */    unsigned emulated:1;    /*     * True if the low-level driver performs its own reset-settle delays.     */    unsigned skip_settle_delay:1;    /* True if the controller does not support WRITE SAME */    unsigned no_write_same:1;    /*     * True if asynchronous aborts are not supported     */    unsigned no_async_abort:1;    /*     * Countdown for host blocking with no commands outstanding.     */    unsigned int max_host_blocked; //主機(jī)適配器發(fā)送隊(duì)列的低閥值,允許累計(jì)多個(gè)命令同時(shí)派發(fā)#define SCSI_DEFAULT_HOST_BLOCKED   7    /*     * Pointer to the sysfs class properties for this host, NULL terminated.     */    struct device_attribute **shost_attrs; //主機(jī)適配器類屬性    /*     * Pointer to the SCSI device properties for this host, NULL terminated.     */    struct device_attribute **sdev_attrs;  //主機(jī)適配器設(shè)備屬性    struct list_head legacy_hosts;    u64 vendor_id;    /*     * Additional per-command data allocated for the driver.     */  //scsi 命令緩沖池,scsi命令都是預(yù)先分配好的,保存在cmd_pool中    unsigned int cmd_size;    struct scsi_host_cmd_pool *cmd_pool;    /* temporary flag to disable blk-mq I/O path */    bool disable_blk_mq;  //禁用通用塊層多隊(duì)列模式標(biāo)志};

主機適配器Scsi_Host

Scsi_Host描述一個SCSI主機適配器,SCSI主機適配器通常是一塊基于PCI總線的擴展卡或是一個SCSI控制器芯片。每個SCSI主機適配器可以存在多個通道,一個通道實際擴展了一條SCSI總線。每個通道可以連接多個SCSI目標節點,具體連接數量與SCSI總線帶載能力有關,或者受具體SCSI協議的限制。 真實的主機總線適配器是接入主機IO總線上(通常是PCI總線),在系統啟動時,會掃描掛載在PCI總線上的設備,此時會分配主機總線適配器。 
Scsi_Host結構包含內嵌通用設備,將被鏈入SCSI總線類型(scsi_bus_type)的設備鏈表。

struct Scsi_Host {    struct list_head    __devices; //設(shè)備鏈表    struct list_head    __targets; //目標(biāo)節(jié)點(diǎn)鏈表    struct scsi_host_cmd_pool *cmd_pool; //scsi命令緩沖池    spinlock_t      free_list_lock;   //保護(hù)free_list    struct list_head    free_list; /* backup store of cmd structs, scsi命令預(yù)先分配的備用命令鏈表 */    struct list_head    starved_list; //scsi命令的饑餓鏈表    spinlock_t      default_lock;    spinlock_t      *host_lock;    struct mutex        scan_mutex;/* serialize scanning activity */    struct list_head    eh_cmd_q; //執(zhí)行錯(cuò)誤的scsi命令的鏈表    struct task_struct    * ehandler;  /* Error recovery thread. 錯(cuò)誤恢復(fù)線程 */    struct completion     * eh_action; /* Wait for specific actions on the                          host. */    wait_queue_head_t       host_wait; //scsi設(shè)備恢復(fù)等待隊(duì)列    struct scsi_host_template *hostt;  //主機(jī)適配器模板    struct scsi_transport_template *transportt; //指向SCSI傳輸層模板    /*     * Area to keep a shared tag map (if needed, will be     * NULL if not).     */    union {        struct blk_queue_tag    *bqt;        struct blk_mq_tag_set   tag_set; //SCSI支持多隊(duì)列時(shí)使用    };    //已經(jīng)派發(fā)給主機(jī)適配器(低層驅(qū)動)的scsi命令數(shù)    atomic_t host_busy;        /* commands actually active on low-level */    atomic_t host_blocked;  //阻塞的scsi命令數(shù)    unsigned int host_failed;      /* commands that failed.                          protected by host_lock */    unsigned int host_eh_scheduled;    /* EH scheduled without command */    unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. 系統(tǒng)內(nèi)唯一標(biāo)識 */    /* next two fields are used to bound the time spent in error handling */    int eh_deadline;    unsigned long last_reset; //記錄上次reset時(shí)間    /*     * These three parameters can be used to allow for wide scsi,     * and for host adapters that support multiple busses     * The last two should be set to 1 more than the actual max id     * or lun (e.g. 8 for SCSI parallel systems).     
*/    unsigned int max_channel; //主機(jī)適配器的最大通道編號    unsigned int max_id;      //主機(jī)適配器目標(biāo)節(jié)點(diǎn)最大編號    u64 max_lun;              //主機(jī)適配器lun最大編號    unsigned int unique_id;    /*     * The maximum length of SCSI commands that this host can accept.     * Probably 12 for most host adapters, but could be 16 for others.     * or 260 if the driver supports variable length cdbs.     * For drivers that don't set this field, a value of 12 is     * assumed.     */    unsigned short max_cmd_len;  //主機(jī)適配器可以接受的最長的SCSI命令    //下面這段在scsi_host_template中也有,由template中的字段賦值    int this_id;    int can_queue;    short cmd_per_lun;    short unsigned int sg_tablesize;    short unsigned int sg_prot_tablesize;    unsigned int max_sectors;    unsigned long dma_boundary;    /*     * In scsi-mq mode, the number of hardware queues supported by the LLD.     *     * Note: it is assumed that each hardware queue has a queue depth of     * can_queue. In other words, the total queue depth per host     * is nr_hw_queues * can_queue.     */    unsigned nr_hw_queues; //在scsi-mq模式中,低層驅(qū)動所支持的硬件隊(duì)列的數(shù)量    /*     * Used to assign serial numbers to the cmds.     * Protected by the host lock.     */    unsigned long cmd_serial_number;  //指向命令序列號unsigned active_mode:2;           //標(biāo)識是initiator或target    unsigned unchecked_isa_dma:1;    unsigned use_clustering:1;    /*     * Host has requested that no further requests come through for the     * time being.     */    unsigned host_self_blocked:1; //表示低層驅(qū)動要求阻塞該主機(jī)適配器,此時(shí)中間層不會繼續(xù)派發(fā)命令到主機(jī)適配器隊(duì)列中    /*     * Host uses correct SCSI ordering not PC ordering. The bit is     * set for the minority of drivers whose authors actually read     * the spec ;).     
*/    unsigned reverse_ordering:1;    /* Task mgmt function in progress */    unsigned tmf_in_progress:1;  //任務(wù)管理函數(shù)正在執(zhí)行    /* Asynchronous scan in progress */    unsigned async_scan:1;       //異步掃描正在執(zhí)行    /* Don't resume host in EH */    unsigned eh_noresume:1;      //在錯(cuò)誤處理過程不恢復(fù)主機(jī)適配器    /* The controller does not support WRITE SAME */    unsigned no_write_same:1;    unsigned use_blk_mq:1;       //是否使用SCSI多隊(duì)列模式    unsigned use_cmd_list:1;    /* Host responded with short (<36 bytes) INQUIRY result */    unsigned short_inquiry:1;    /*     * Optional work queue to be utilized by the transport     */    char work_q_name[20];  //被scsi傳輸層使用的工作隊(duì)列    struct workqueue_struct *work_q;    /*     * Task management function work queue     */    struct workqueue_struct *tmf_work_q; //任務(wù)管理函數(shù)工作隊(duì)列    /* The transport requires the LUN bits NOT to be stored in CDB[1] */    unsigned no_scsi2_lun_in_cdb:1;    /*     * Value host_blocked counts down from     */    unsigned int max_host_blocked; //在派發(fā)隊(duì)列中累計(jì)命令達(dá)到這個(gè)數(shù)值,才開始喚醒主機(jī)適配器    /* Protection Information */    unsigned int prot_capabilities;    unsigned char prot_guard_type;    /*     * q used for scsi_tgt msgs, async events or any other requests that     * need to be processed in userspace     */    struct request_queue *uspace_req_q; //需要在用戶空間處理的scsi_tgt消息、異步事件或其他請求的請求隊(duì)列    /* legacy crap */    unsigned long base;    unsigned long io_port;   //I/O端口編號    unsigned char n_io_port;    unsigned char dma_channel;    unsigned int  irq;    enum scsi_host_state shost_state; //狀態(tài)    /* ldm bits */ //shost_gendev: 內(nèi)嵌通用設(shè)備,SCSI設(shè)備通過這個(gè)域鏈入SCSI總線類型(scsi_bus_type)的設(shè)備鏈表    struct device       shost_gendev, shost_dev;    //shost_dev: 內(nèi)嵌類設(shè)備, SCSI設(shè)備通過這個(gè)域鏈入SCSI主機(jī)適配器類型(shost_class)的設(shè)備鏈表    /*     * List of hosts per template.     *     * This is only for use by scsi_module.c for legacy templates.     
* For these access to it is synchronized implicitly by     * module_init/module_exit.     */    struct list_head sht_legacy_list;    /*     * Points to the transport data (if any) which is allocated     * separately     */    void *shost_data; //指向獨(dú)立分配的傳輸層數(shù)據(jù),由SCSI傳輸層使用    /*     * Points to the physical bus device we'd use to do DMA     * Needed just in case we have virtual hosts.     */    struct device *dma_dev;    /*     * We should ensure that this is aligned, both for better performance     * and also because some compilers (m68k) don't automatically force     * alignment to a long boundary.     */ //主機(jī)適配器專有數(shù)據(jù)    unsigned long hostdata[0]  /* Used for storage of host specific stuff */        __attribute__ ((aligned (sizeof(unsigned long))));};

目標節點scsi_target

scsi_target結構中有一個內嵌驅動模型設備,被鏈入SCSI總線類型scsi_bus_type的設備鏈表。

struct scsi_target {    struct scsi_device  *starget_sdev_user; //指向正在進(jìn)行I/O的scsi設(shè)備,沒有IO則指向NULL    struct list_head    siblings;  //鏈入主機(jī)適配器target鏈表中    struct list_head    devices;   //屬于該target的device鏈表    struct device       dev;       //通用設(shè)備,用于加入設(shè)備驅(qū)動模型    struct kref     reap_ref; /* last put renders target invisible 本結(jié)構(gòu)的引用計(jì)數(shù) */    unsigned int        channel;   //該target所在的channel號    unsigned int        id; /* target id ... replace                     * scsi_device.id eventually */    unsigned int        create:1; /* signal that it needs to be added */    unsigned int        single_lun:1;   /* Indicates we should only                         * allow I/O to one of the luns                         * for the device at a time. */    unsigned int        pdt_1f_for_no_lun:1;    /* PDT = 0x1f                         * means no lun present. */    unsigned int        no_report_luns:1;   /* Don't use                         * REPORT LUNS for scanning. */    unsigned int        expecting_lun_change:1; /* A device has reported                         * a 3F/0E UA, other devices on                         * the same target will also. */    /* commands actually active on LLD. */    atomic_t        target_busy;    atomic_t        target_blocked;           //當(dāng)前阻塞的命令數(shù)    /*     * LLDs should set this in the slave_alloc host template callout.     * If set to zero then there is not limit.     */    unsigned int        can_queue;             //同時(shí)處理的命令數(shù)    unsigned int        max_target_blocked;    //阻塞命令數(shù)閥值#define SCSI_DEFAULT_TARGET_BLOCKED 3    char            scsi_level;                //支持的SCSI規(guī)范級別    enum scsi_target_state  state;             //target狀態(tài)    void            *hostdata; /* available to low-level driver */    unsigned long       starget_data[0]; /* for the transport SCSI傳輸層(中間層)使用 */    /* starget_data must be the last element!!!! */} __attribute__((aligned(sizeof(unsigned long))));

邏輯設備scsi_device

scsi_device描述scsi邏輯設備,代表scsi磁盤的邏輯單元lun。scsi_device描述符所代表的設備可能是另一臺存儲設備上的SATA/SAS/SCSI磁盤或SSD。操作系統在掃描到連接在主機適配器上的邏輯設備時,創建scsi_device結構,用于scsi高層驅動和該設備通信。

struct scsi_device {    struct Scsi_Host *host;  //所歸屬的主機(jī)總線適配器    struct request_queue *request_queue; //請求隊(duì)列    /* the next two are protected by the host->host_lock */    struct list_head    siblings;   /* list of all devices on this host */ //鏈入主機(jī)總線適配器設(shè)備鏈表    struct list_head    same_target_siblings; /* just the devices sharing same target id */ //鏈入target的設(shè)備鏈表    atomic_t device_busy;       /* commands actually active on LLDD */    atomic_t device_blocked;    /* Device returned QUEUE_FULL. */    spinlock_t list_lock;    struct list_head cmd_list;  /* queue of in use SCSI Command structures */    struct list_head starved_entry; //鏈入主機(jī)適配器的"饑餓"鏈表    struct scsi_cmnd *current_cmnd; /* currently active command */ //當(dāng)前正在執(zhí)行的命令    unsigned short queue_depth; /* How deep of a queue we want */    unsigned short max_queue_depth; /* max queue depth */    unsigned short last_queue_full_depth; /* These two are used by */    unsigned short last_queue_full_count; /* scsi_track_queue_full() */    unsigned long last_queue_full_time; /* last queue full time */    unsigned long queue_ramp_up_period; /* ramp up period in jiffies */#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)    unsigned long last_queue_ramp_up;   /* last queue ramp up time */    unsigned int id, channel; //scsi_device所屬的target id和所在channel通道號    u64 lun;  //該設(shè)備的lun編號    unsigned int manufacturer;  /* Manufacturer of device, for using  制造商                     * vendor-specific cmd's */    unsigned sector_size;   /* size in bytes 硬件的扇區(qū)大小 */    void *hostdata;     /* available to low-level driver 專有數(shù)據(jù) */    char type;          //SCSI設(shè)備類型    char scsi_level;    //所支持SCSI規(guī)范的版本號,由INQUIRY命令獲得    char inq_periph_qual;   /* PQ from INQUIRY data */    unsigned char inquiry_len;  /* valid bytes in 'inquiry' */    unsigned char * inquiry;    /* INQUIRY response data */    const char * vendor;        /* [back_compat] point into 'inquiry' ... 
*/    const char * model;     /* ... after scan; point to static string */    const char * rev;       /* ... "nullnullnullnull" before scan */#define SCSI_VPD_PG_LEN                255    int vpd_pg83_len;          //sense命令 0x83    unsigned char *vpd_pg83;    int vpd_pg80_len;          //sense命令 0x80    unsigned char *vpd_pg80;    unsigned char current_tag;  /* current tag */    struct scsi_target      *sdev_target;   /* used only for single_lun */    unsigned int    sdev_bflags; /* black/white flags as also found in                 * scsi_devinfo.[hc]. For now used only to                 * pass settings from slave_alloc to scsi                 * core. */    unsigned int eh_timeout; /* Error handling timeout */    unsigned removable:1;    unsigned changed:1; /* Data invalid due to media change */    unsigned busy:1;    /* Used to prevent races */    unsigned lockable:1;    /* Able to prevent media removal */    unsigned locked:1;      /* Media removal disabled */    unsigned borken:1;  /* Tell the Seagate driver to be                 * painfully slow on this device */    unsigned disconnect:1;  /* can disconnect */    unsigned soft_reset:1;  /* Uses soft reset option */    unsigned sdtr:1;    /* Device supports SDTR messages 支持同步數(shù)據(jù)傳輸 */    unsigned wdtr:1;    /* Device supports WDTR messages 支持16位寬數(shù)據(jù)傳輸*/    unsigned ppr:1;     /* Device supports PPR messages 支持PPR(并行協(xié)議請求)消息*/    unsigned tagged_supported:1;    /* Supports SCSI-II tagged queuing */    unsigned simple_tags:1; /* simple queue tag messages are enabled */    unsigned was_reset:1;   /* There was a bus reset on the bus for                 * this device */    unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN                     * because we did a bus reset. 
*/    unsigned use_10_for_rw:1; /* first try 10-byte read / write */    unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */    unsigned no_report_opcodes:1;   /* no REPORT SUPPORTED OPERATION CODES */    unsigned no_write_same:1;   /* no WRITE SAME command */    unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */    unsigned skip_ms_page_8:1;  /* do not use MODE SENSE page 0x08 */    unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */    unsigned skip_vpd_pages:1;  /* do not read VPD pages */    unsigned try_vpd_pages:1;   /* attempt to read VPD pages */    unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */    unsigned no_start_on_add:1; /* do not issue start on add */    unsigned allow_restart:1; /* issue START_UNIT in error handler */    unsigned manage_start_stop:1;   /* Let HLD (sd) manage start/stop */    unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */    unsigned no_uld_attach:1; /* disable connecting to upper level drivers */    unsigned select_no_atn:1;    unsigned fix_capacity:1;    /* READ_CAPACITY is too high by 1 */    unsigned guess_capacity:1;  /* READ_CAPACITY might be too high by 1 */    unsigned retry_hwerror:1;   /* Retry HARDWARE_ERROR */    unsigned last_sector_bug:1; /* do not use multisector accesses on                       SD_LAST_BUGGY_SECTORS */    unsigned no_read_disc_info:1;   /* Avoid READ_DISC_INFO cmds */    unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */    unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */    unsigned is_visible:1;  /* is the device visible in sysfs */    unsigned wce_default_on:1;  /* Cache is ON by default */    unsigned no_dif:1;  /* T10 PI (DIF) should be disabled */    unsigned broken_fua:1;      /* Don't set FUA bit */    unsigned lun_in_cdb:1;      /* Store LUN bits in CDB[1] */    atomic_t disk_events_disable_depth; /* disable depth for disk events */    
DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */    DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */    struct list_head event_list;    /* asserted events */    struct work_struct event_work;    unsigned int max_device_blocked; /* what device_blocked counts down from  */#define SCSI_DEFAULT_DEVICE_BLOCKED 3    atomic_t iorequest_cnt;    atomic_t iodone_cnt;    atomic_t ioerr_cnt;    struct device       sdev_gendev, //內(nèi)嵌通用設(shè)備, 鏈入scsi總線類型(scsi_bus_type)的設(shè)備鏈表                sdev_dev; //內(nèi)嵌類設(shè)備,鏈入scsi設(shè)備類(sdev_class)的設(shè)備鏈表    struct execute_work ew; /* used to get process context on put */    struct work_struct  requeue_work;    struct scsi_device_handler *handler; //自定義設(shè)備處理函數(shù)    void            *handler_data;    enum scsi_device_state sdev_state;  //scsi設(shè)備狀態(tài)    unsigned long       sdev_data[0];   //scsi傳輸層使用} __attribute__((aligned(sizeof(unsigned long))));

內核定義的SCSI命令結構scsi_cmnd

scsi_cmnd結構由SCSI中間層創建,傳遞到SCSI低層驅動。每個IO請求會被創建一個scsi_cmnd,但scsi_cmnd并不一定是IO請求。scsi_cmnd最終轉化成一個具體的SCSI命令。除了命令描述塊之外,scsi_cmnd包含更豐富的信息,包括數據緩沖區、感測數據緩沖區、完成回調函數以及所關聯的塊設備驅動層請求等,是SCSI中間層執行SCSI命令的上下文。

struct scsi_cmnd {    struct scsi_device *device;  //指向命令所屬SCSI設(shè)備的描述符的指針    struct list_head list;  /* scsi_cmnd participates in queue lists 鏈入scsi設(shè)備的命令鏈表 */    struct list_head eh_entry; /* entry for the host eh_cmd_q */    struct delayed_work abort_work;    int eh_eflags;      /* Used by error handlr */    /*     * A SCSI Command is assigned a nonzero serial_number before passed     * to the driver's queue command function.  The serial_number is     * cleared when scsi_done is entered indicating that the command     * has been completed.  It is a bug for LLDDs to use this number     * for purposes other than printk (and even that is only useful     * for debugging).     */    unsigned long serial_number; //scsi命令的唯一序號    /*     * This is set to jiffies as it was when the command was first     * allocated.  It is used to time how long the command has     * been outstanding     */    unsigned long jiffies_at_alloc; //分配時(shí)的jiffies, 用于計(jì)算命令處理時(shí)間    int retries;  //命令重試次數(shù)    int allowed;  //允許的重試次數(shù)    unsigned char prot_op;    //保護(hù)操作(DIF和DIX)    unsigned char prot_type;  //DIF保護(hù)類型    unsigned char prot_flags;    unsigned short cmd_len;   //命令長度    enum dma_data_direction sc_data_direction;  //命令傳輸方向    /* These elements define the operation we are about to perform */    unsigned char *cmnd;  //scsi規(guī)范格式的命令字符串    /* These elements define the operation we ultimately want to perform */    struct scsi_data_buffer sdb;        //scsi命令數(shù)據(jù)緩沖區(qū)    struct scsi_data_buffer *prot_sdb;  //scsi命令保護(hù)信息緩沖區(qū)    unsigned underflow; /* Return error if less than                   this amount is transferred */    unsigned transfersize;  /* How much we are guaranteed to  //傳輸單位                   transfer with each SCSI transfer                   (ie, between disconnect /                   reconnects.   
Probably == sector                   size */    struct request *request;    /* The command we are  通用塊層的請求描述符                       working on */#define SCSI_SENSE_BUFFERSIZE   96    unsigned char *sense_buffer;    //scsi命令感測數(shù)據(jù)緩沖區(qū)                /* obtained by REQUEST SENSE when                 * CHECK CONDITION is received on original                 * command (auto-sense) */    /* Low-level done function - can be used by low-level driver to point     *        to completion function.  Not used by mid/upper level code. */    void (*scsi_done) (struct scsi_cmnd *); //scsi命令在低層驅(qū)動完成時(shí),回調(diào)    /*     * The following fields can be written to by the host specific code.     * Everything else should be left alone.     */    struct scsi_pointer SCp;    /* Scratchpad used by some host adapters */    unsigned char *host_scribble;   /* The host adapter is allowed to                     * call scsi_malloc and get some memory                     * and hang it here.  The host adapter                     * is also expected to call scsi_free                     * to release this memory.  (The memory                     * obtained by scsi_malloc is guaranteed                     * to be at an address < 16Mb). */    int result;     /* Status code from lower level driver */    int flags;      /* Command flags */    unsigned char tag;  /* SCSI-II queued command tag */};

驅動scsi_driver

struct scsi_driver {    struct device_driver    gendrv;  // "繼承"device_driver    void (*rescan)(struct device *); //重新掃描前調(diào)用的回調(diào)函數(shù)    int (*init_command)(struct scsi_cmnd *);    void (*uninit_command)(struct scsi_cmnd *);    int (*done)(struct scsi_cmnd *);  //當(dāng)?shù)蛯域?qū)動完成一個(gè)scsi命令時(shí)調(diào)用,用于計(jì)算已經(jīng)完成的字節(jié)數(shù)    int (*eh_action)(struct scsi_cmnd *, int); //錯(cuò)誤處理回調(diào)};

設備模型

  • scsi_bus_type: scsi子系統總線類型

struct bus_type scsi_bus_type = {        .name       = "scsi",   // 對應(yīng)/sys/bus/scsi        .match      = scsi_bus_match,    .uevent     = scsi_bus_uevent,#ifdef CONFIG_PM    .pm     = &scsi_bus_pm_ops,#endif};EXPORT_SYMBOL_GPL(scsi_bus_type);
  • shost_class: scsi子系統類

static struct class shost_class = {    .name       = "scsi_host",  // 對應(yīng)/sys/class/scsi_host    .dev_release    = scsi_host_cls_release,};

初始化過程

操作系統啟動時,會加載scsi子系統,入口函數是init_scsi,使用subsys_initcall定義:

/*
 * Initialise the scsi subsystem.
 *
 * Runs the individual init steps in order; when one fails, the steps that
 * already succeeded are undone in reverse order and the error is returned.
 * Returns 0 on success.
 */
static int __init init_scsi(void)
{
    int error;

    /* Set up the memory pools needed for scatter-gather lists. */
    error = scsi_init_queue();
    if (error)
        return error;

    /* Set up the scsi-related entries in procfs. */
    error = scsi_init_procfs();
    if (error)
        goto cleanup_queue;

    /* Set up the dynamic scsi device-info list. */
    error = scsi_init_devinfo();
    if (error)
        goto cleanup_procfs;

    /* Register the shost_class class (creates /sys/class/scsi_host). */
    error = scsi_init_hosts();
    if (error)
        goto cleanup_devlist;

    /* Register the SCSI sysctl table. */
    error = scsi_init_sysctl();
    if (error)
        goto cleanup_hosts;

    /* Register the scsi_bus_type bus type and the sdev_class class. */
    error = scsi_sysfs_register();
    if (error)
        goto cleanup_sysctl;

    /* Set up the SCSI transport netlink interface. */
    scsi_netlink_init();

    printk(KERN_NOTICE "SCSI subsystem initialized\n");
    return 0;

cleanup_sysctl:
    scsi_exit_sysctl();
cleanup_hosts:
    scsi_exit_hosts();
cleanup_devlist:
    scsi_exit_devinfo();
cleanup_procfs:
    scsi_exit_procfs();
cleanup_queue:
    scsi_exit_queue();
    printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
           -error);
    return error;
}

scsi_init_hosts函數初始化scsi子系統主機適配器所屬的類shost_class:

/*
 * Register the host adapter class shost_class.
 * Returns whatever class_register() returns.
 */
int scsi_init_hosts(void)
{
    return class_register(&shost_class);
}

scsi_sysfs_register函數初始化scsi子系統總線類型scsi_bus_type和設備所屬的類sdev_class:

/*
 * Register the scsi bus type and then the scsi device class.
 *
 * If the class cannot be registered, the bus registration is rolled back.
 * Returns 0 on success or the error from the step that failed.
 */
int scsi_sysfs_register(void)
{
    int rc = bus_register(&scsi_bus_type);

    if (rc)
        return rc;

    rc = class_register(&sdev_class);
    if (rc)
        bus_unregister(&scsi_bus_type);  /* undo the bus registration */

    return rc;
}

scsi低層驅動是面向主機適配器的,低層驅動被加載時,需要添加主機適配器。主機適配器添加有兩種方式:1.在PCI子系統掃描掛載驅動時添加;2.手動方式添加。所有基于硬件PCI接口的主機適配器都采用第一種方式。添加主機適配器包括兩個步驟: 
1. 分配主機適配器數據結構scsi_host_alloc 
2. 將主機適配器添加到系統scsi_add_host

struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize){    struct Scsi_Host *shost;    gfp_t gfp_mask = GFP_KERNEL;    if (sht->unchecked_isa_dma && privsize)        gfp_mask |= __GFP_DMA;    //一次分配Scsi_Host和私有數(shù)據(jù)空間    shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);    if (!shost)        return NULL;    shost->host_lock = &shost->default_lock;    spin_lock_init(shost->host_lock);    shost->shost_state = SHOST_CREATED; //更新狀態(tài)    INIT_LIST_HEAD(&shost->__devices);  //初始化scsi設(shè)備鏈表    INIT_LIST_HEAD(&shost->__targets);  //初始化target鏈表    INIT_LIST_HEAD(&shost->eh_cmd_q);   //初始化執(zhí)行錯(cuò)誤的scsi命令鏈表    INIT_LIST_HEAD(&shost->starved_list);   //初始化scsi命令饑餓鏈表    init_waitqueue_head(&shost->host_wait);    mutex_init(&shost->scan_mutex);    /*     * subtract one because we increment first then return, but we need to     * know what the next host number was before increment     */ //遞增分配主機(jī)適配器號    shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;    shost->dma_channel = 0xff;    /* These three are default values which can be overridden */    shost->max_channel = 0; //默認(rèn)通道號為0    shost->max_id = 8;      //默認(rèn)target最大數(shù)量    shost->max_lun = 8;     //默認(rèn)scsi_device最大數(shù)量    /* Give each shost a default transportt */    shost->transportt = &blank_transport_template;  //scsi傳輸層(中間層)模板    /*     * All drivers right now should be able to handle 12 byte     * commands.  Every so often there are requests for 16 byte     * commands, but individual low-level drivers need to certify that     * they actually do something sensible with such commands.     
*/    shost->max_cmd_len = 12;  //最長的SCSI命令長度    shost->hostt = sht;       //使用主機(jī)適配器模板    shost->this_id = sht->this_id;    shost->can_queue = sht->can_queue;    shost->sg_tablesize = sht->sg_tablesize;    shost->sg_prot_tablesize = sht->sg_prot_tablesize;    shost->cmd_per_lun = sht->cmd_per_lun;    shost->unchecked_isa_dma = sht->unchecked_isa_dma;    shost->use_clustering = sht->use_clustering;    shost->no_write_same = sht->no_write_same;    if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)        shost->eh_deadline = -1;    else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {        shost_printk(KERN_WARNING, shost,                 "eh_deadline %u too large, setting to %u\n",                 shost_eh_deadline, INT_MAX / HZ);        shost->eh_deadline = INT_MAX;    } else        shost->eh_deadline = shost_eh_deadline * HZ;    if (sht->supported_mode == MODE_UNKNOWN) //由模板指定HBA的模式        /* means we didn't set it ... default to INITIATOR */        shost->active_mode = MODE_INITIATOR;  //主機(jī)適配器模式默認(rèn)是initiator    else        shost->active_mode = sht->supported_mode;    if (sht->max_host_blocked)        shost->max_host_blocked = sht->max_host_blocked;    else        shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;    /*     * If the driver imposes no hard sector transfer limit, start at     * machine infinity initially.     
*/    if (sht->max_sectors)        shost->max_sectors = sht->max_sectors;    else        shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;    /*     * assume a 4GB boundary, if not set     */    if (sht->dma_boundary)        shost->dma_boundary = sht->dma_boundary;    else        shost->dma_boundary = 0xffffffff;  //默認(rèn)DMA的邊界為4G    shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;    device_initialize(&shost->shost_gendev); //初始化主機(jī)適配器內(nèi)部通用設(shè)備    dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);    shost->shost_gendev.bus = &scsi_bus_type;   //設(shè)置主機(jī)適配器的總線類型    shost->shost_gendev.type = &scsi_host_type; //設(shè)置主機(jī)適配器的設(shè)備類型    device_initialize(&shost->shost_dev);    //初始化主機(jī)適配器的內(nèi)部類設(shè)備    shost->shost_dev.parent = &shost->shost_gendev; //內(nèi)部類設(shè)備的父設(shè)備設(shè)置為其內(nèi)部通用設(shè)備    shost->shost_dev.class = &shost_class;   //設(shè)置內(nèi)部類設(shè)備所屬的類是shost_class    dev_set_name(&shost->shost_dev, "host%d", shost->host_no);    shost->shost_dev.groups = scsi_sysfs_shost_attr_groups;  //設(shè)置類設(shè)備的屬性組    shost->ehandler = kthread_run(scsi_error_handler, shost,  //啟動主機(jī)適配器的錯(cuò)誤恢復(fù)內(nèi)核線程            "scsi_eh_%d", shost->host_no);    if (IS_ERR(shost->ehandler)) {        shost_printk(KERN_WARNING, shost,            "error handler thread failed to spawn, error = %ld\n",            PTR_ERR(shost->ehandler));        goto fail_kfree;    }    //分配任務(wù)管理工作隊(duì)列    shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",                        WQ_UNBOUND | WQ_MEM_RECLAIM,                       1, shost->host_no);    if (!shost->tmf_work_q) {        shost_printk(KERN_WARNING, shost,                 "failed to create tmf workq\n");        goto fail_kthread;    }    scsi_proc_hostdir_add(shost->hostt); //在procfs中添加主機(jī)適配器的目錄, eg. 
//創(chuàng)建/proc/scsi/<主機(jī)適配器名稱>目錄    return shost; fail_kthread:    kthread_stop(shost->ehandler); fail_kfree:    kfree(shost);    return NULL;}EXPORT_SYMBOL(scsi_host_alloc);
static inline int __must_check scsi_add_host(struct Scsi_Host *host,                         struct device *dev) //dev為父設(shè)備{    return scsi_add_host_with_dma(host, dev, dev);}int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,               struct device *dma_dev){    struct scsi_host_template *sht = shost->hostt;    int error = -EINVAL;    shost_printk(KERN_INFO, shost, "%s\n",            sht->info ? sht->info(shost) : sht->name);    if (!shost->can_queue) {        shost_printk(KERN_ERR, shost,                 "can_queue = 0 no longer supported\n");        goto fail;    }    if (shost_use_blk_mq(shost)) {         //如果主機(jī)適配器設(shè)置使用多隊(duì)列IO,則建立        error = scsi_mq_setup_tags(shost); //相應(yīng)的多隊(duì)列環(huán)境        if (error)            goto fail;    } else {        shost->bqt = blk_init_tags(shost->can_queue,                shost->hostt->tag_alloc_policy);        if (!shost->bqt) {            error = -ENOMEM;            goto fail;        }    }    /*     * Note that we allocate the freelist even for the MQ case for now,     * as we need a command set aside for scsi_reset_provider.  Having     * the full host freelist and one command available for that is a     * little heavy-handed, but avoids introducing a special allocator     * just for this.  Eventually the structure of scsi_reset_provider     * will need a major overhaul.     */ //分配存儲scsi命令和sense數(shù)據(jù)的緩沖區(qū), 并分配scsi命令的備用倉庫鏈表    error = scsi_setup_command_freelist(shost);    if (error)        goto out_destroy_tags;    //設(shè)置主機(jī)適配器的父設(shè)備,確定該設(shè)備在sysfs中的位置,通常會通過dev參數(shù)傳入pci_dev。    if (!shost->shost_gendev.parent)        shost->shost_gendev.parent = dev ? 
dev : &platform_bus; //如果dev為NULL,設(shè)置為platform_bus    if (!dma_dev)        dma_dev = shost->shost_gendev.parent;    shost->dma_dev = dma_dev;    error = device_add(&shost->shost_gendev);  //添加主機(jī)適配器通用設(shè)備到系統(tǒng)    if (error)        goto out_destroy_freelist;    pm_runtime_set_active(&shost->shost_gendev);    pm_runtime_enable(&shost->shost_gendev);    device_enable_async_suspend(&shost->shost_gendev); //支持異步掛起通用設(shè)備    scsi_host_set_state(shost, SHOST_RUNNING);  //設(shè)置主機(jī)適配器狀態(tài)    get_device(shost->shost_gendev.parent);     //增加通用父設(shè)備的引用計(jì)數(shù)    device_enable_async_suspend(&shost->shost_dev);  //支持異步掛起類設(shè)備    error = device_add(&shost->shost_dev);    //添加主機(jī)適配器類設(shè)備到系統(tǒng)    if (error)        goto out_del_gendev;    get_device(&shost->shost_gendev);    if (shost->transportt->host_size) {  //scsi傳輸層使用的數(shù)據(jù)空間        shost->shost_data = kzalloc(shost->transportt->host_size,                     GFP_KERNEL);        if (shost->shost_data == NULL) {            error = -ENOMEM;            goto out_del_dev;        }    }    if (shost->transportt->create_work_queue) {        snprintf(shost->work_q_name, sizeof(shost->work_q_name),             "scsi_wq_%d", shost->host_no);        shost->work_q = create_singlethread_workqueue( //分配被scsi傳輸層使用的工作隊(duì)列                    shost->work_q_name);        if (!shost->work_q) {            error = -EINVAL;            goto out_free_shost_data;        }    }    error = scsi_sysfs_add_host(shost); //添加主機(jī)適配器到子系統(tǒng)    if (error)        goto out_destroy_host;    scsi_proc_host_add(shost);  //在procfs添加主機(jī)適配器信息    return error; out_destroy_host:    if (shost->work_q)        destroy_workqueue(shost->work_q); out_free_shost_data:    kfree(shost->shost_data); out_del_dev:    device_del(&shost->shost_dev); out_del_gendev:    device_del(&shost->shost_gendev); out_destroy_freelist:    scsi_destroy_command_freelist(shost); out_destroy_tags:    if (shost_use_blk_mq(shost))        
scsi_mq_destroy_tags(shost); fail:    return error;}EXPORT_SYMBOL(scsi_add_host_with_dma);

設備探測過程

在系統啟動過程中,會掃描默認的PCI根總線,從而觸發PCI設備掃描的過程,開始構造PCI設備樹。SCSI主機適配器是掛載在PCI總線上的設備,它作為PCI設備會被PCI總線驅動層掃描到(PCI設備的掃描采用配置空間訪問的方式)。掃描到SCSI主機適配器后,操作系統開始加載SCSI主機適配器驅動,也就是上面所說的低層驅動。SCSI主機適配器驅動根據SCSI主機適配器模板分配SCSI主機適配器描述符,并添加到系統,之后啟動由SCSI主機適配器擴展出來的下一級總線——SCSI總線的掃描過程。

SCSI中間層依次以可能的ID和LUN構造INQUIRY命令,之后將這些INQUIRY命令提交給塊IO子系統,后者最終又會調用SCSI中間層的策略例程,再次提取到SCSI命令結構后,調用SCSI低層驅動的queuecommand回調函數來實現。
對于給定ID的目標節點,如果它在SCSI總線上存在,那么它一定要實現對LUN0的INQUIRY響應。也就是說,通過向某個ID的目標節點的LUN0發送INQUIRY命令,或依次向各個LUN嘗試發送INQUIRY命令,并檢查是否能收到響應,SCSI中間層最終能夠得到SCSI域中所連接的邏輯設備及其信息。

SCSI總線具體的掃描方式可以由具體的主機適配器固件、主機適配器驅動實現,在此只討論由主機適配器驅動調用SCSI中間層提供的通用掃描函數scsi_scan_host的實現方式。

void scsi_scan_host(struct Scsi_Host *shost){    struct async_scan_data *data;    if (strncmp(scsi_scan_type, "none", 4) == 0) //檢查掃描邏輯        return;    if (scsi_autopm_get_host(shost) < 0)        return;    data = scsi_prep_async_scan(shost); //準(zhǔn)備異步掃描    if (!data) {        do_scsi_scan_host(shost);    //同步掃描        scsi_autopm_put_host(shost);        return;    }    /* register with the async subsystem so wait_for_device_probe()     * will flush this work     */    async_schedule(do_scan_async, data);  //異步掃描    /* scsi_autopm_put_host(shost) is called in scsi_finish_async_scan() */}EXPORT_SYMBOL(scsi_scan_host);

scsi_scan_host函數是SCSI中間層提供的主機適配器掃描函數。對于有自定義掃描邏輯需求的主機適配器驅動,可以設置主機適配器模板的scan_start和scan_finished回調函數,scsi_scan_host函數會(通過do_scsi_scan_host)調用這些回調來實現自定義掃描。
scsi_scan_type變量指定了掃描方式:async、sync、none。無論最終掃描方式是同步還是異步,都是由do_scsi_scan_host函數實現:

static void do_scsi_scan_host(struct Scsi_Host *shost){    if (shost->hostt->scan_finished) {  //使用自定義掃描方式        unsigned long start = jiffies;        if (shost->hostt->scan_start)            shost->hostt->scan_start(shost); //自定義掃描開始回調(diào)        while (!shost->hostt->scan_finished(shost, jiffies - start)) //自定義掃描完成時(shí)返回1            msleep(10);    } else { //scsi子系統(tǒng)通用掃描函數(shù), SCAN_WILD_CARD表示掃描所有的target和device        scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,                SCAN_WILD_CARD, 0);    }}

如果主機適配器模板設置了自定義掃描函數,do_scsi_scan_host函數將會調用它們。如果沒有設置,則使用默認的掃描函數scsi_scan_host_selected執行掃描。

/*
 * scsi_scan_host_selected - scan selected channel/id/lun on a host.
 * @shost:   host adapter to scan
 * @channel: channel number, or SCAN_WILD_CARD for every channel
 * @id:      target id, or SCAN_WILD_CARD
 * @lun:     logical unit number, or SCAN_WILD_CARD
 * @rescan:  passed through to scsi_scan_channel()
 *
 * Returns -EINVAL when a non-wildcard argument is out of the host's range,
 * 0 otherwise (including when the host state does not allow scanning).
 */
int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
			    unsigned int id, u64 lun, int rescan)
{
	SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
		"%s: <%u:%u:%llu>\n",
		__func__, channel, id, lun));

	/* Reject explicit values that exceed what the host supports. */
	if (channel != SCAN_WILD_CARD && channel > shost->max_channel)
		return -EINVAL;
	if (id != SCAN_WILD_CARD && id >= shost->max_id)
		return -EINVAL;
	if (lun != SCAN_WILD_CARD && lun >= shost->max_lun)
		return -EINVAL;

	mutex_lock(&shost->scan_mutex);

	if (!shost->async_scan)
		scsi_complete_async_scans();

	/* Skip the scan if the host state forbids it or it cannot resume. */
	if (!scsi_host_scan_allowed(shost) || scsi_autopm_get_host(shost) != 0)
		goto out_unlock;

	if (channel != SCAN_WILD_CARD) {
		/* A single, explicitly requested channel. */
		scsi_scan_channel(shost, channel, id, lun, rescan);
	} else {
		/* Wildcard: walk every channel the host exposes. */
		unsigned int chan;

		for (chan = 0; chan <= shost->max_channel; chan++)
			scsi_scan_channel(shost, chan, id, lun, rescan);
	}
	scsi_autopm_put_host(shost);

 out_unlock:
	mutex_unlock(&shost->scan_mutex);

	return 0;
}

scsi_scan_host_selected函數掃描指定的主機適配器,根據輸入的參數決定是遍歷掃描所有channel還是只掃描指定的channel,具體掃描通過函數scsi_scan_channel完成。

/*
 * scsi_scan_channel - scan one channel of a host.
 * @shost:   host adapter
 * @channel: channel to scan
 * @id:      target id, or SCAN_WILD_CARD to try ids 0 .. max_id - 1
 * @lun:     LUN to scan (may be SCAN_WILD_CARD), passed to the target scan
 * @rescan:  passed through to __scsi_scan_target()
 */
static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
			      unsigned int id, u64 lun, int rescan)
{
	uint target_id;

	if (id != SCAN_WILD_CARD) {
		/* Only the requested target. */
		__scsi_scan_target(&shost->shost_gendev, channel,
				id, lun, rescan);
		return;
	}

	/*
	 * XXX adapter drivers when possible (FCP, iSCSI)
	 * could modify max_id to match the current max,
	 * not the absolute max.
	 *
	 * XXX add a shost id iterator, so for example,
	 * the FC ID can be the same as a target id
	 * without a huge overhead of sparse id's.
	 */
	for (id = 0; id < shost->max_id; ++id) {
		/* With reverse_ordering set, ids are visited high to low. */
		target_id = shost->reverse_ordering ?
				shost->max_id - id - 1 : id;
		__scsi_scan_target(&shost->shost_gendev, channel,
				target_id, lun, rescan);
	}
}

__scsi_scan_target函數掃描指定target內部的lun。

/*
 * __scsi_scan_target - scan the LUNs of one target.
 * @parent:  device the target hangs off; the owning host is derived from it
 * @channel: channel number
 * @id:      target id
 * @lun:     a specific LUN, or SCAN_WILD_CARD to discover all LUNs
 * @rescan:  passed through to the probe helpers
 *
 * The host's own id is never scanned.  For a wildcard LUN, LUN 0 is probed
 * first; if the LUN or at least the target responds, REPORT LUNS is tried
 * and, if that did not scan the target, a sequential LUN scan is done.
 */
static void __scsi_scan_target(struct device *parent, unsigned int channel,
		unsigned int id, u64 lun, int rescan)
{
	struct Scsi_Host *shost = dev_to_shost(parent);
	struct scsi_target *starget;
	int bflags = 0;
	int lun0_res;

	/* Don't scan the host adapter */
	if (id == shost->this_id)
		return;

	/* Allocate and initialise the target structure for this id. */
	starget = scsi_alloc_target(parent, channel, id);
	if (!starget)
		return;
	scsi_autopm_get_target(starget);

	if (lun != SCAN_WILD_CARD) {
		/* Scan for a specific host/chan/id/lun. */
		scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
	} else {
		/*
		 * Scan LUN 0, if there is some response, scan further.
		 * Ideally, we would not configure LUN 0 until all LUNs are
		 * scanned.
		 */
		lun0_res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL,
						  rescan, NULL);
		if ((lun0_res == SCSI_SCAN_LUN_PRESENT ||
		     lun0_res == SCSI_SCAN_TARGET_PRESENT) &&
		    scsi_report_lun_scan(starget, bflags, rescan) != 0) {
			/*
			 * The REPORT LUN did not scan the target,
			 * do a sequential scan.
			 */
			scsi_sequential_lun_scan(starget, bflags,
						 starget->scsi_level, rescan);
		}
	}

	scsi_autopm_put_target(starget);
	/*
	 * paired with scsi_alloc_target(): determine if the target has
	 * any children at all and if not, nuke it
	 */
	scsi_target_reap(starget);

	put_device(&starget->dev);
}

掃描到target時分配并初始化scsi_target結構,scsi_probe_and_add_lun函數完成對target中lun的探測,并將發現的lun添加到系統。

/*
 * scsi_probe_and_add_lun - probe one LUN of a target and add it if present.
 * @starget:  target the LUN belongs to
 * @lun:      logical unit number to probe
 * @bflagsp:  if non-NULL, receives the device flags found for the LUN
 * @sdevp:    if non-NULL, receives the scsi_device when the LUN is present
 * @rescan:   non-zero when this is not the first scan of the host
 * @hostdata: passed to scsi_alloc_sdev() when a new device is allocated
 *
 * Returns SCSI_SCAN_LUN_PRESENT when a device exists or was added,
 * SCSI_SCAN_TARGET_PRESENT when the target answered but no usable LUN is
 * there, and SCSI_SCAN_NO_RESPONSE otherwise (including allocation failure).
 * On every result other than SCSI_SCAN_LUN_PRESENT a newly allocated
 * scsi_device is removed again before returning.
 */
static int scsi_probe_and_add_lun(struct scsi_target *starget,
				  u64 lun, int *bflagsp,
				  struct scsi_device **sdevp, int rescan,
				  void *hostdata)
{
	struct scsi_device *sdev;
	unsigned char *result;
	int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

	/*
	 * The rescan flag is used as an optimization, the first scan of a
	 * host adapter calls into here with rescan == 0.
	 */
	/* Look for an already known device with this LUN on the target. */
	sdev = scsi_device_lookup_by_target(starget, lun);
	if (sdev) {
		/*
		 * Device already known. On a rescan, or when it has left the
		 * "created" state, report it as present without probing again.
		 */
		if (rescan || !scsi_device_created(sdev)) {
			SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
				"scsi scan: device exists on %s\n",
				dev_name(&sdev->sdev_gendev)));
			/* Hand the device to the caller, or drop it again. */
			if (sdevp)
				*sdevp = sdev;
			else
				scsi_device_put(sdev);

			if (bflagsp)
				*bflagsp = scsi_get_device_flags(sdev,
								 sdev->vendor,
								 sdev->model);
			return SCSI_SCAN_LUN_PRESENT;
		}
		/* Still in the created state: keep using sdev and probe it. */
		scsi_device_put(sdev);
	} else
		/* LUN not known yet: allocate a fresh scsi_device. */
		sdev = scsi_alloc_sdev(starget, lun, hostdata);
	if (!sdev)
		goto out;

	/* Buffer for the INQUIRY response (result_len == 256 bytes). */
	result = kmalloc(result_len, GFP_ATOMIC |
			((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
	if (!result)
		goto out_free_sdev;

	/* Send INQUIRY to the device; non-zero means the probe failed. */
	if (scsi_probe_lun(sdev, result, result_len, &bflags))
		goto out_free_result;

	if (bflagsp)
		*bflagsp = bflags;
	/*
	 * result contains valid SCSI INQUIRY data.
	 */
	if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
		/*
		 * For a Peripheral qualifier 3 (011b), the SCSI
		 * spec says: The device server is not capable of
		 * supporting a physical device on this logical
		 * unit.
		 *
		 * For disks, this implies that there is no
		 * logical disk configured at sdev->lun, but there
		 * is a target id responding.
		 */
		SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
				   " peripheral qualifier of 3, device not"
				   " added\n"))
		if (lun == 0) {
			SCSI_LOG_SCAN_BUS(1, {
				unsigned char vend[9];
				unsigned char mod[17];

				sdev_printk(KERN_INFO, sdev,
					"scsi scan: consider passing scsi_mod."
					"dev_flags=%s:%s:0x240 or 0x1000240\n",
					scsi_inq_str(vend, result, 8, 16),
					scsi_inq_str(mod, result, 16, 32));
			});

		}

		res = SCSI_SCAN_TARGET_PRESENT;
		goto out_free_result;
	}

	/*
	 * Some targets may set slight variations of PQ and PDT to signal
	 * that no LUN is present, so don't add sdev in these cases.
	 * Two specific examples are:
	 * 1) NetApp targets: return PQ=1, PDT=0x1f
	 * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
	 *    in the UFI 1.0 spec (we cannot rely on reserved bits).
	 *
	 * References:
	 * 1) SCSI SPC-3, pp. 145-146
	 * PQ=1: "A peripheral device having the specified peripheral
	 * device type is not connected to this logical unit. However, the
	 * device server is capable of supporting the specified peripheral
	 * device type on this logical unit."
	 * PDT=0x1f: "Unknown or no device type"
	 * 2) USB UFI 1.0, p. 20
	 * PDT=00h Direct-access device (floppy)
	 * PDT=1Fh none (no FDD connected to the requested logical unit)
	 */
	if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
	    (result[0] & 0x1f) == 0x1f &&
	    !scsi_is_wlun(lun)) {
		SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
					"scsi scan: peripheral device type"
					" of 31, no device added\n"));
		res = SCSI_SCAN_TARGET_PRESENT;
		goto out_free_result;
	}

	/* Configure the device from the INQUIRY data and add it. */
	res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
	if (res == SCSI_SCAN_LUN_PRESENT) {
		if (bflags & BLIST_KEY) {
			sdev->lockable = 0;
			scsi_unlock_floptical(sdev, result);
		}
	}

 out_free_result:
	kfree(result);
 out_free_sdev:
	if (res == SCSI_SCAN_LUN_PRESENT) {
		if (sdevp) {
			/*
			 * The caller wants the device: it is only handed
			 * back if a reference can be taken, otherwise the
			 * device is removed and the LUN reported as absent.
			 */
			if (scsi_device_get(sdev) == 0) {
				*sdevp = sdev;
			} else {
				__scsi_remove_device(sdev);
				res = SCSI_SCAN_NO_RESPONSE;
			}
		}
	} else
		/* Nothing usable at this LUN: tear the device down again. */
		__scsi_remove_device(sdev);
 out:
	return res;
}

由名字可知,scsi_probe_and_add_lun函數完成lun的probe和add兩個操作:
1. 探測邏輯設備scsi_probe_lun,發送INQUIRY命令到具體設備。
2. 添加邏輯設備到系統scsi_add_lun,根據INQUIRY命令返回值添加lun到系統。

static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,              int result_len, int *bflags){    unsigned char scsi_cmd[MAX_COMMAND_SIZE];    int first_inquiry_len, try_inquiry_len, next_inquiry_len;    int response_len = 0;    int pass, count, result;    struct scsi_sense_hdr sshdr;    *bflags = 0;    /* Perform up to 3 passes.  The first pass uses a conservative     * transfer length of 36 unless sdev->inquiry_len specifies a     * different value. */    first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;    try_inquiry_len = first_inquiry_len;    pass = 1; next_pass:    SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,                "scsi scan: INQUIRY pass %d length %d\n",                pass, try_inquiry_len));    /* Each pass gets up to three chances to ignore Unit Attention */    for (count = 0; count < 3; ++count) {        int resid;        memset(scsi_cmd, 0, 6);        scsi_cmd[0] = INQUIRY;      //命令類型是INQUIRY        scsi_cmd[4] = (unsigned char) try_inquiry_len;        memset(inq_result, 0, try_inquiry_len);        //發(fā)送SCSI命令,重試3次        result = scsi_execute_req(sdev,  scsi_cmd, DMA_FROM_DEVICE,                      inq_result, try_inquiry_len, &sshdr,                      HZ / 2 + HZ * scsi_inq_timeout, 3,                      &resid);        SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,                "scsi scan: INQUIRY %s with code 0x%x\n",                result ? "failed" : "successful", result));        if (result) {            /*             * not-ready to ready transition [asc/ascq=0x28/0x0]             * or power-on, reset [asc/ascq=0x29/0x0], continue.             * INQUIRY should not yield UNIT_ATTENTION             * but many buggy devices do so anyway.             
*/            if ((driver_byte(result) & DRIVER_SENSE) &&                scsi_sense_valid(&sshdr)) {                if ((sshdr.sense_key == UNIT_ATTENTION) &&                    ((sshdr.asc == 0x28) ||                     (sshdr.asc == 0x29)) &&                    (sshdr.ascq == 0))                    continue;            }        } else {            /*             * if nothing was transferred, we try             * again. It's a workaround for some USB             * devices.             */            if (resid == try_inquiry_len)                continue;        }        break;    }    if (result == 0) {        sanitize_inquiry_string(&inq_result[8], 8);        sanitize_inquiry_string(&inq_result[16], 16);        sanitize_inquiry_string(&inq_result[32], 4);        response_len = inq_result[4] + 5;        if (response_len > 255)            response_len = first_inquiry_len;   /* sanity */        /*         * Get any flags for this device.         *         * XXX add a bflags to scsi_device, and replace the         * corresponding bit fields in scsi_device, so bflags         * need not be passed as an argument.         */        *bflags = scsi_get_device_flags(sdev, &inq_result[8],                &inq_result[16]);        /* When the first pass succeeds we gain information about         * what larger transfer lengths might work. 
*/        if (pass == 1) {            if (BLIST_INQUIRY_36 & *bflags)                next_inquiry_len = 36;            else if (BLIST_INQUIRY_58 & *bflags)                next_inquiry_len = 58;            else if (sdev->inquiry_len)                next_inquiry_len = sdev->inquiry_len;            else                next_inquiry_len = response_len;            /* If more data is available perform the second pass */            if (next_inquiry_len > try_inquiry_len) {                try_inquiry_len = next_inquiry_len;                pass = 2;                goto next_pass;            }        }    } else if (pass == 2) {        sdev_printk(KERN_INFO, sdev,                "scsi scan: %d byte inquiry failed.  "                "Consider BLIST_INQUIRY_36 for this device\n",                try_inquiry_len);        /* If this pass failed, the third pass goes back and transfers         * the same amount as we successfully got in the first pass. */        try_inquiry_len = first_inquiry_len;        pass = 3;        goto next_pass;    }    /* If the last transfer attempt got an error, assume the     * peripheral doesn't exist or is dead. */    if (result)        return -EIO;    /* Don't report any more data than the device says is valid */    sdev->inquiry_len = min(try_inquiry_len, response_len);    /*     * XXX Abort if the response length is less than 36? If less than     * 32, the lookup of the device flags (above) could be invalid,     * and it would be possible to take an incorrect action - we do     * not want to hang because of a short INQUIRY. On the flip side,     * if the device is spun down or becoming ready (and so it gives a     * short INQUIRY), an abort here prevents any further use of the     * device, including spin up.     *     * On the whole, the best approach seems to be to assume the first     * 36 bytes are valid no matter what the device says.  
That's     * better than copying < 36 bytes to the inquiry-result buffer     * and displaying garbage for the Vendor, Product, or Revision     * strings.     */    if (sdev->inquiry_len < 36) {        if (!sdev->host->short_inquiry) {            shost_printk(KERN_INFO, sdev->host,                    "scsi scan: INQUIRY result too short (%d),"                    " using 36\n", sdev->inquiry_len);            sdev->host->short_inquiry = 1;        }        sdev->inquiry_len = 36;    }    /*     * Related to the above issue:     *     * XXX Devices (disk or all?) should be sent a TEST UNIT READY,     * and if not ready, sent a START_STOP to start (maybe spin up) and     * then send the INQUIRY again, since the INQUIRY can change after     * a device is initialized.     *     * Ideally, start a device if explicitly asked to do so.  This     * assumes that a device is spun up on power on, spun down on     * request, and then spun up on request.     */    /*     * The scanning code needs to know the scsi_level, even if no     * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so     * non-zero LUNs can be scanned.     */    sdev->scsi_level = inq_result[2] & 0x07;    if (sdev->scsi_level >= 2 ||        (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))        sdev->scsi_level++;    sdev->sdev_target->scsi_level = sdev->scsi_level;    /*     * If SCSI-2 or lower, and if the transport requires it,     * store the LUN value in CDB[1].     */    sdev->lun_in_cdb = 0;    if (sdev->scsi_level <= SCSI_2 &&        sdev->scsi_level != SCSI_UNKNOWN &&        !sdev->host->no_scsi2_lun_in_cdb)        sdev->lun_in_cdb = 1;    return 0;}static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,        int *bflags, int async){    int ret;    /*     * XXX do not save the inquiry, since it can change underneath us,     * save just vendor/model/rev.     
*     * Rather than save it and have an ioctl that retrieves the saved     * value, have an ioctl that executes the same INQUIRY code used     * in scsi_probe_lun, let user level programs doing INQUIRY     * scanning run at their own risk, or supply a user level program     * that can correctly scan.     */    /*     * Copy at least 36 bytes of INQUIRY data, so that we don't     * dereference unallocated memory when accessing the Vendor,     * Product, and Revision strings.  Badly behaved devices may set     * the INQUIRY Additional Length byte to a small value, indicating     * these strings are invalid, but often they contain plausible data     * nonetheless.  It doesn't matter if the device sent < 36 bytes     * total, since scsi_probe_lun() initializes inq_result with 0s.     */    sdev->inquiry = kmemdup(inq_result,                max_t(size_t, sdev->inquiry_len, 36),                GFP_ATOMIC);    if (sdev->inquiry == NULL)        return SCSI_SCAN_NO_RESPONSE;    sdev->vendor = (char *) (sdev->inquiry + 8); //第8個(gè)字節(jié)到第15個(gè)字節(jié)是vendor identification    sdev->model = (char *) (sdev->inquiry + 16); //第16個(gè)字節(jié)到第31個(gè)字節(jié)是product identification    sdev->rev = (char *) (sdev->inquiry + 32);   //第32個(gè)字節(jié)到第35個(gè)字節(jié)是product revision level    if (strncmp(sdev->vendor, "ATA     ", 8) == 0) {        /*         * sata emulation layer device.  This is a hack to work around         * the SATL power management specifications which state that         * when the SATL detects the device has gone into standby         * mode, it shall respond with NOT READY.         */        sdev->allow_restart = 1;    }    if (*bflags & BLIST_ISROM) {        sdev->type = TYPE_ROM;        sdev->removable = 1;    } else {        sdev->type = (inq_result[0] & 0x1f);        sdev->removable = (inq_result[1] & 0x80) >> 7;        /*         * some devices may respond with wrong type for         * well-known logical units. 
Force well-known type         * to enumerate them correctly.         */        if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) {            sdev_printk(KERN_WARNING, sdev,                "%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n",                __func__, sdev->type, (unsigned int)sdev->lun);            sdev->type = TYPE_WLUN;        }    }    if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) {        /* RBC and MMC devices can return SCSI-3 compliance and yet         * still not support REPORT LUNS, so make them act as         * BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is         * specifically set */        if ((*bflags & BLIST_REPORTLUN2) == 0)            *bflags |= BLIST_NOREPORTLUN;    }    /*     * For a peripheral qualifier (PQ) value of 1 (001b), the SCSI     * spec says: The device server is capable of supporting the     * specified peripheral device type on this logical unit. However,     * the physical device is not currently connected to this logical     * unit.     *     * The above is vague, as it implies that we could treat 001 and     * 011 the same. Stay compatible with previous code, and create a     * scsi_device for a PQ of 1     *     * Don't set the device offline here; rather let the upper     * level drivers eval the PQ to decide whether they should     * attach. So remove ((inq_result[0] >> 5) & 7) == 1 check.     
*/    sdev->inq_periph_qual = (inq_result[0] >> 5) & 7;    sdev->lockable = sdev->removable;    sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2);    if (sdev->scsi_level >= SCSI_3 ||            (sdev->inquiry_len > 56 && inq_result[56] & 0x04))        sdev->ppr = 1;    if (inq_result[7] & 0x60)        sdev->wdtr = 1;    if (inq_result[7] & 0x10)        sdev->sdtr = 1;    sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d "            "ANSI: %d%s\n", scsi_device_type(sdev->type),            sdev->vendor, sdev->model, sdev->rev,            sdev->inq_periph_qual, inq_result[2] & 0x07,            (inq_result[3] & 0x0f) == 1 ? " CCS" : "");    if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) &&        !(*bflags & BLIST_NOTQ)) {        sdev->tagged_supported = 1;        sdev->simple_tags = 1;    }    /*     * Some devices (Texel CD ROM drives) have handshaking problems     * when used with the Seagate controllers. borken is initialized     * to 1, and then set it to 0 here.     */    if ((*bflags & BLIST_BORKEN) == 0)        sdev->borken = 0;    if (*bflags & BLIST_NO_ULD_ATTACH)        sdev->no_uld_attach = 1;    /*     * Apparently some really broken devices (contrary to the SCSI     * standards) need to be selected without asserting ATN     */    if (*bflags & BLIST_SELECT_NO_ATN)        sdev->select_no_atn = 1;    /*     * Maximum 512 sector transfer length     * broken RA4x00 Compaq Disk Array     */    if (*bflags & BLIST_MAX_512)        blk_queue_max_hw_sectors(sdev->request_queue, 512);    /*     * Max 1024 sector transfer length for targets that report incorrect     * max/optimal lengths and relied on the old block layer safe default     */    else if (*bflags & BLIST_MAX_1024)        blk_queue_max_hw_sectors(sdev->request_queue, 1024);    /*     * Some devices may not want to have a start command automatically     * issued when a device is added.     
*/    if (*bflags & BLIST_NOSTARTONADD)        sdev->no_start_on_add = 1;    if (*bflags & BLIST_SINGLELUN)        scsi_target(sdev)->single_lun = 1;    sdev->use_10_for_rw = 1;    if (*bflags & BLIST_MS_SKIP_PAGE_08)        sdev->skip_ms_page_8 = 1;    if (*bflags & BLIST_MS_SKIP_PAGE_3F)        sdev->skip_ms_page_3f = 1;    if (*bflags & BLIST_USE_10_BYTE_MS)        sdev->use_10_for_ms = 1;    /* some devices don't like REPORT SUPPORTED OPERATION CODES     * and will simply timeout causing sd_mod init to take a very     * very long time */    if (*bflags & BLIST_NO_RSOC)        sdev->no_report_opcodes = 1;    /* set the device running here so that slave configure     * may do I/O */    ret = scsi_device_set_state(sdev, SDEV_RUNNING); //狀態(tài)    if (ret) {        ret = scsi_device_set_state(sdev, SDEV_BLOCK);        if (ret) {            sdev_printk(KERN_ERR, sdev,                    "in wrong state %s to complete scan\n",                    scsi_device_state_name(sdev->sdev_state));            return SCSI_SCAN_NO_RESPONSE;        }    }    if (*bflags & BLIST_MS_192_BYTES_FOR_3F)        sdev->use_192_bytes_for_3f = 1;    if (*bflags & BLIST_NOT_LOCKABLE)        sdev->lockable = 0;    if (*bflags & BLIST_RETRY_HWERROR)        sdev->retry_hwerror = 1;    if (*bflags & BLIST_NO_DIF)        sdev->no_dif = 1;    sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;    if (*bflags & BLIST_TRY_VPD_PAGES)        sdev->try_vpd_pages = 1;    else if (*bflags & BLIST_SKIP_VPD_PAGES)        sdev->skip_vpd_pages = 1;    transport_configure_device(&sdev->sdev_gendev); //把lun配置到scsi傳輸層    if (sdev->host->hostt->slave_configure) {        ret = sdev->host->hostt->slave_configure(sdev); //主機(jī)適配器模板設(shè)置的回調(diào),對scsi_device(lun)執(zhí)行特定的初始化        if (ret) {            /*             * if LLDD reports slave not present, don't clutter             * console with alloc failure messages             */            if (ret != -ENXIO) {                sdev_printk(KERN_ERR, sdev,             
       "failed to configure device\n");            }            return SCSI_SCAN_NO_RESPONSE;        }    }    if (sdev->scsi_level >= SCSI_3)        scsi_attach_vpd(sdev);    sdev->max_queue_depth = sdev->queue_depth;  //設(shè)置最大隊(duì)列深度    /*     * Ok, the device is now all set up, we can     * register it and tell the rest of the kernel     * about it.     */ //添加scsi_device(lun)到sysfs    if (!async && scsi_sysfs_add_sdev(sdev) != 0)        return SCSI_SCAN_NO_RESPONSE;    return SCSI_SCAN_LUN_PRESENT;}
本站僅提供存儲服務(wù),所有內(nèi)容均由用戶發(fā)布,如發(fā)現(xiàn)有害或侵權(quán)內(nèi)容,請點(diǎn)擊舉報(bào)
打開APP,閱讀全文并永久保存 查看更多類似文章
猜你喜歡
類似文章
Linux 2.6內(nèi)核的設(shè)備模型
Linux設(shè)備模型 - 鍵盤棒棒的日志 - 網(wǎng)易博客
我對linux理解之driver_register
驅(qū)動的初始化
面試如果被問到Linux設(shè)備驅(qū)動模型怎么答?看完這篇就能給出滿意答案了
linux內(nèi)核部件分析(十)——設(shè)備驅(qū)動模型之class
更多類似文章 >>
生活服務(wù)
分享 收藏 導(dǎo)長圖 關(guān)注 下載文章
綁定賬號成功
后續(xù)可登錄賬號暢享VIP特權(quán)!
如果VIP功能使用有故障,
可點(diǎn)擊這里聯(lián)系客服!

聯(lián)系客服