--- include/linux/topology.h Mon Apr 9 12:43:37 2001
+++ include/linux/topology.h Mon Apr 9 11:30:39 2001
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2001 Silicon Graphics, Inc.
+ * Copyright (C) 2001 Kanoj Sarcar (kanoj@sgi.com)
+ *
+ * This file deals with exposing as much of the machine structure
+ * as sophisticated users might need to optimize application performance.
+ */
+
+#ifndef _LINUX_TOPOLOGY_H
+#define _LINUX_TOPOLOGY_H
+
+#define MACHSTRING "machine"
+#define VERSTRING "version"
+#define NODESTRING "node"
+#define CPUSTRING "cpu"
+#define MEMSTRING "memory"
+#define MSIZESTRING "memsize"
+#define SHCACHSTRING "shcach"
+#define DISTSTRING "distances"
+
+#define MAXSTRINGLEN 20
+#define MAXNUMBERLEN 5
+#define MAXNUMBER 9999
+
+/*
+ * Consider this the user level manpage for the topology graph:
+ *
+ * The graph _must_ always be searched recursively. Additional links
+ * (whether generic, or platform specific) might be added or deleted
+ * without notice. Portable user programs that look at this graph must
+ * look for certain known names in hierarchical order.
+ *
+ * The topology graph is rooted at /proc/machine. Platforms can link
+ * this point into their own topology graph implementations, if they
+ * follow the rules below. Else, they risk being incompatible with
+ * all other Linux platforms.
+ *
+ * Programs should read the "version" vertex under the root to
+ * determine what version of the graph they are looking at. The
+ * following describes version 0 of the graph. Future platforms
+ * might lead to changes in the graph description, and those will
+ * be marked by newer versions of the graph.
+ *
+ * Some levels under the root, there will be, possibly multiple,
+ * "nodeABCD" vertices. ABCD is the logical node number. This means
+ * the machine is composed of these nodes.
+ *
+ * Some levels under "nodeABCD" vertex, there may be, possibly multiple,
+ * "cpuPQRS" vertices. PQRS is the logical cpu number. The node has
+ * all these cpus.
+ *
+ * Some levels under "nodeABCD" vertex, there may be a "memoryEFGH"
+ * vertex. If there is, some levels under "memoryEFGH" vertex, there
+ * will be a "memsize" vertex. The number obtained by cat'ing this
+ * vertex represents the total memory on the node.
+ *
+ * Some levels under "nodeABCD" vertex, there will be a vertex
+ * named "distances". Cat'ing this file will give distances from
+ * this node to all others. These distances loosely indicate how far
+ * from each other the nodes are, and how fat the links between the
+ * nodes are.
+ *
+ * Some levels under "cpuPQRS" and "cpuWXYZ" vertices, possible
+ * cache sharing is indicated by the presence of the same vertex
+ * name "shcachKLMN".
+ *
+ * At any point in the graph, if there is a vertex with a name that is
+ * not mentioned above, it is a platform specific component. Platform
+ * vendors define these names and the tools/drivers to utilize them.
+ *
+ * Whenever a vertex needs to be cat'ed to obtain information, the
+ * kernel sends back information via streams of ascii characters. This
+ * stream should be parsed with the knowledge that string separators
+ * (like \n, \t etc) can be replaced with others for better
+ * presentability.
+ *
+ * No other rules should be assumed by users looking at the graph.
+ * If any other rules are assumed, program portability across platforms
+ * and compatibility across releases are not guaranteed.
+ *
+ * Example: find /proc/machine -name "*" -print may yield the line
+ * /proc/machine/......./node0003/......./cpu0007. This indicates
+ * node 3 contains cpu 7. If it also yields the output
+ * /proc/machine/..../node0002/.../memory0002/memsize, it indicates
+ * node 2 has some memory, whose size can be obtained by cat'ing the
+ * "memsize" vertex. If it also yields the line
+ * /proc/machine/.../node0001/.../platreg, then "platreg" represents
+ * a platform specific component on node 1 that the platform wants to
+ * expose to users for some reason.
+ */
+
+/*
+ * Kernel interfaces for arch/platform code. On success, the interfaces
+ * return 0, on failure, they return 1. topo_init() is the exception: it
+ * returns the root vertex of the graph rather than a status code.
+ */
+
+typedef struct proc_dir_entry * topo_vertex_t;
+
+/*
+ * Returns root of the topology graph. Input determines which version
+ * of the graph the platform wants to use.
+ */
+extern topo_vertex_t topo_init(int);
+
+/*
+ * Add the (first parameter) node to the topology graph at the (second
+ * parameter) add point. If the addpoint is 0, the node is added
+ * directly under the topology graph root.
+ */
+extern int topo_node_add(node_data_t *, topo_vertex_t);
+
+/*
+ * Add the (second parameter) cpunumber to the topology graph at the
+ * (third parameter) add point. If the addpoint is 0, the cpu is added
+ * directly under the topology vertex for the (first parameter) node.
+ */
+extern int topo_cpu_add(node_data_t *, unsigned int, topo_vertex_t);
+
+/*
+ * Add the memory vertices for the (first parameter) node to the
+ * (second parameter) add point. If the addpoint is 0, the memory
+ * hierarchy is added directly under the topology vertex for the node.
+ */
+extern int topo_mem_add(node_data_t *, topo_vertex_t);
+
+/*
+ * Indicates that the first and second components share a resource,
+ * which is currently only a level of cache.
+ */
+extern int topo_shared_cache_add(topo_vertex_t, topo_vertex_t);
+
+#endif /* _LINUX_TOPOLOGY_H */
--- kernel/topology.c Mon Apr 9 12:43:37 2001
+++ kernel/topology.c Mon Apr 9 12:01:04 2001
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2001 Silicon Graphics, Inc.
+ * Copyright (C) 2001 Kanoj Sarcar (kanoj@sgi.com)
+ *
+ * This file deals with exposing as much of the machine structure
+ * as sophisticated users might need to optimize application performance.
+ *
+ * Implementation currently uses procfs, but could be converted to
+ * use devfs, or some other simple fs. Proc fs suffers from not having
+ * file links, which are needed for shared resources (caches, routers etc).
+ */ + +#include +#include +#include +#include +#include + +extern void free_proc_entry(struct proc_dir_entry *); + +static topo_vertex_t mach_root; + +static void num_to_string(unsigned int number, char *str) +{ + int index = MAXNUMBERLEN - 2; + + if (number > MAXNUMBER) + printk("WARNING: node/cpu number exceeded ... %d\n", number); + str[index + 1] = 0; + while (number) { + str[index] = '0' + (number % 10); + number /= 10; + index--; + } + while (index >= 0) + str[index--] = '0'; +} + +static void fillstring(char *target, char *string, int number) +{ + char numstr[MAXNUMBERLEN]; + + strcpy(target, string); + num_to_string(number, numstr); + strcat(target, numstr); +} + +/* + * This must be called before any other functions in this file + * are invoked. + */ +topo_vertex_t topo_init(int version) +{ + char versname[MAXSTRINGLEN]; + + mach_root = proc_mkdir(MACHSTRING, 0); + if (mach_root) { + fillstring(versname, VERSTRING, version); + proc_mkdir(versname, mach_root); + } + return(mach_root); +} + +static int read_distances(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int i, len = 0; + node_data_t *ndat = (node_data_t *)data; + + for (i = 0; i < numnodes; i++, page += 13) + len += sprintf(page, "Node %3d: %2d\n", i, ndat->distances[i]); + return len; +} + +int topo_node_add(node_data_t *ndat, topo_vertex_t addpoint) +{ + char nodename[MAXSTRINGLEN]; + struct proc_dir_entry *entry; + + fillstring(nodename, NODESTRING, ndat->node_id); + if (addpoint == (topo_vertex_t)0) + addpoint = mach_root; + if ((ndat->nodeinfo = (void *)proc_mkdir(nodename, mach_root))) { + entry = create_proc_entry(DISTSTRING, S_IRUGO, ndat->nodeinfo); + if (!entry) { + free_proc_entry((struct proc_dir_entry *)(ndat->nodeinfo)); + ndat->nodeinfo = 0; + return 1; + } + entry->nlink = 1; + entry->data = (void *)ndat; + entry->read_proc = read_distances; + return 0; + } + return 1; +} + +/* + * This module does not manage cpu numbers. 
+ */ +int topo_cpu_add(node_data_t *ndat, unsigned int cpunum, topo_vertex_t addpoint) +{ + int i; + char cpuname[MAXSTRINGLEN]; + struct proc_dir_entry *entry; + + fillstring(cpuname, CPUSTRING, cpunum); + if (addpoint == (topo_vertex_t)0) + addpoint = (topo_vertex_t)(ndat->nodeinfo); + entry = proc_mkdir(cpuname, addpoint); + if (!entry) + return 1; + for (i = 0; i < MAX_CPUS_PER_NODE; i++) + if (ndat->pinfo[i] == (void *)0) { + ndat->pinfo[i] = (void *)entry; + return 0; + } + return 1; +} + +/* + * When we support multiple memory extants in a node, this will return + * the sum across all extants. Alternatively, we could expose per + * extant information too. Fix: node_size does not represent the + * true size for platforms that handle their own holes. + */ +static int read_memsize(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + node_data_t *ndat = (node_data_t *)data; + + if (count < 8) + return -EINVAL; + return sprintf(page, "%8d kB\n", (ndat->node_pgdat->node_size << + (PAGE_SHIFT - 10))); +} + +/* + * This module hands out memory numbers, at least for now (note that + * memory numbers and node numbers might be different, if there are + * nodes with no memory). When we support multiple memory extants in + * a node, this will take an input to describe the extant. 
+ */ +int topo_mem_add(node_data_t *ndat, topo_vertex_t addpoint) +{ + static int mem_id = 0; + char memname[MAXSTRINGLEN]; + struct proc_dir_entry *entry; + + fillstring(memname, MEMSTRING, mem_id++); + if (addpoint == (topo_vertex_t)0) + addpoint = (topo_vertex_t)(ndat->nodeinfo); + if ((ndat->minfo = proc_mkdir(memname, addpoint))) { + entry = create_proc_entry(MSIZESTRING, S_IRUGO, ndat->minfo); + if (!entry) { + free_proc_entry((struct proc_dir_entry *)(ndat->minfo)); + ndat->minfo = 0; + return 1; + } + entry->nlink = 1; + entry->data = (void *)ndat; + entry->read_proc = read_memsize; + return 0; + } + return 1; +} + +int topo_shared_cache_add(topo_vertex_t first, topo_vertex_t second) +{ + static int cache_id = 0; + char cachename[MAXSTRINGLEN]; + struct proc_dir_entry *entry1, *entry2; + + fillstring(cachename, SHCACHSTRING, cache_id++); + entry1 = proc_mkdir(cachename, first); + if (entry1 == 0) + return 1; + entry2 = proc_mkdir(cachename, second); + if (entry2 == 0) { + free_proc_entry(entry1); + return 1; + } + return 0; +} + --- kernel/Makefile Wed Jan 10 09:18:09 2001 +++ kernel/Makefile Wed Apr 4 21:31:10 2001 @@ -14,7 +14,7 @@ obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \ module.o exit.o itimer.o info.o time.o softirq.o resource.o \ sysctl.o acct.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o context.o + signal.o sys.o kmod.o context.o topology.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o --- include/linux/mmzone.h Wed Nov 22 18:00:56 2000 +++ include/linux/mmzone.h Mon Apr 9 12:38:37 2001 @@ -7,6 +7,7 @@ #include #include #include +#include /* * Free memory management - zoned buddy allocator. 
@@ -76,6 +77,7 @@
 #define NR_GFPINDEX 0x100
 
 struct bootmem_data;
+struct node_data;
 typedef struct pglist_data {
 	zone_t node_zones[MAX_NR_ZONES];
 	zonelist_t node_zonelists[NR_GFPINDEX];
@@ -87,6 +89,7 @@
 	unsigned long node_size;
 	int node_id;
 	struct pglist_data *node_next;
+	struct node_data *node_info;
 } pg_data_t;
 
 extern int numnodes;
@@ -108,6 +111,14 @@
 
 extern pg_data_t contig_page_data;
 
+#ifndef CONFIG_NUMA
+
+#define MAX_CPUS_PER_NODE NR_CPUS
+#define numa_node_id() 0
+#define MAXNODES 1
+
+#endif /* !CONFIG_NUMA */
+
 #ifndef CONFIG_DISCONTIGMEM
 
 #define NODE_DATA(nid) (&contig_page_data)
@@ -121,6 +132,21 @@
 
 #define MAP_ALIGN(x) ((((x) % sizeof(mem_map_t)) == 0) ? (x) : ((x) + \
 		sizeof(mem_map_t) - ((x) % sizeof(mem_map_t))))
+
+/*
+ * This structure maintains information about a "node", which has
+ * some combination of cpus, memory and devices. A "node" is defined
+ * as the biggest collection of components that have uniform access
+ * to all components on other "node"s.
+ *
+ * The void * members are filled in by kernel/topology.c with the
+ * procfs entries it creates for the node.
+ */
+typedef struct node_data {
+	int node_id;			/* number used to name the node's vertex */
+	pg_data_t *node_pgdat;		/* node_size of this is reported by "memsize" */
+	void *nodeinfo;			/* node directory, set by topo_node_add() */
+	void *pinfo[MAX_CPUS_PER_NODE];	/* cpu directories, set by topo_cpu_add() */
+	void *minfo;			/* memory directory, set by topo_mem_add() */
+	int distances[MAXNODES];	/* printed per node by the "distances" file */
+} node_data_t;
 
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */