Initial thoughts:
One key space to hold all Gene Sets.
Name is the name of the Gene Set,
Description its description. Each element of the set is one link in a link table (~12 bytes plus whatever room the index takes).
- To get all the elements of a Set:
SELECT geneID, geneKS FROM GeneSetLinks WHERE setID = <id of set>;
- To get all the elements of a Set, limited to a particular species (i.e. key set):
SELECT geneID FROM GeneSetLinks WHERE setID = <id of set> && geneKS = <id of key space>;
- Given a User Set, to find the Gene Sets that have one or more of the items of the User Set:
SELECT DISTINCT setID
FROM UserGeneSetLinks AS g1 INNER JOIN GeneSetLinks AS g2
ON g1.geneID = g2.geneID AND g1.geneKS = g2.geneKS
WHERE g1.setID = <id of user set>;
- Given a User Set, find all the Gene Sets that have one or more of the items of the User Set, and get a Count of how many matches each one has:
SELECT DISTINCT g2.setID AS setID, COUNT(*) AS Count
FROM UserGeneSetLinks AS g1 INNER JOIN GeneSetLinks AS g2
ON g1.geneID = g2.geneID AND g1.geneKS = g2.geneKS
WHERE g1.setID = <id of user set>
GROUP BY g2.setID;
The use of the above would be
- Add user set to UserGeneSet? Table
- Run the query.
For doing statistics on Gene Sets, we add a Count column to the
GeneSets Table, that holds the count of how many elements there are in that set.
# Table of all Gene Sets that we track
CREATE TABLE GeneSets (
id INT UNSIGNED NOT NULL AUTO_INCREMENT DEFAULT NULL, # PRIMARY KEY field
name VARCHAR(30) NOT NULL UNIQUE, # Also indexed
description VARCHAR(255) NOT NULL DEFAULT '',
count INT UNSIGNED DEFAULT NULL, # count of elements in the set, for hypergeometric
PRIMARY KEY (id))
ENGINE = MyISAM;
# Table of the elements of each system GeneSet
CREATE TABLE GeneSetLinks (
setID INT UNSIGNED NOT NULL REFERENCES GeneSets (id), # gene set
geneKS INT UNSIGNED NOT NULL REFERENCES KeySpaces (id), # ID of gene's keyspace table
geneID INT UNSIGNED NOT NULL, # ID of gene in gene set
INDEX (setID, geneKS)) # Not unique, but basis for most searches of the table
# INDEX (geneKS), # Here because we may want to do searches on this. Commented out
ENGINE = MyISAM; # because probably not common enough to waste the space on it
# Table of all User Gene Sets
CREATE TABLE UserGeneSets (
id INT UNSIGNED NOT NULL AUTO_INCREMENT DEFAULT NULL, # PRIMARY KEY field
name VARCHAR(30) NOT NULL UNIQUE, # Also indexed
description VARCHAR(255) NOT NULL DEFAULT '',
installTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, # Track so can delete old user sets
count INT UNSIGNED DEFAULT NULL, # count of elements in the set, for hypergeometric
PRIMARY KEY (id))
ENGINE = MyISAM;
# Table of the elements of each system GeneSet
CREATE TABLE UserGeneSetLinks (
setID INT UNSIGNED NOT NULL REFERENCES UserGeneSets (id), # gene set
geneKS INT UNSIGNED NOT NULL REFERENCES KeySpaces (id), # ID of gene's keyspace table
geneID INT UNSIGNED NOT NULL, # ID of gene in gene set
INDEX (setID, geneKS)) # Not unique, but basis for most searches of the table
# INDEX (geneKS), # Here because we may want to do searches on this. Comment out because
ENGINE = MyISAM; # probably not common enough to waste the space on it
-- Main.gregd - 06 Jul 2007