"""Build NSDUH crosstab requests for the SAMHSA RDAS API and hold the results."""

import csv
import json
import os
import sys
import time

import requests
from bs4 import BeautifulSoup


class DataTable:
    """Row/column labelled table intended to be filled from an RDAS response.

    ``table`` holds the cell values, ``col_labels`` / ``row_labels`` hold the
    display titles, and ``_cols`` / ``_rows`` are internal key-to-index maps.
    """

    def __init__(self, rdas_obj=None):
        self.table = {}
        self._cols = {}
        self._rows = {}
        self.col_labels = {}
        self.row_labels = {}
        if rdas_obj is not None:
            self.populate(rdas_obj)

    def populate(self, rdas_obj):
        """Fill the table from ``rdas_obj``.

        TODO: not implemented yet -- this method currently does nothing.
        The planned algorithm is:
        """
        # 1. Put values from results:column:options:key into a list.
        #    a. Sort the list on the int value of the key.
        #    b. Place the list into self._cols, using the value as the key
        #       and the index as the value.
        # 2. Do the same thing for the values from results:row:options:key
        #    and self._rows.
        # 3. Using the key-value pairs in self._cols, place
        #    results:column:options:title into self.col_labels with the
        #    self._cols VALUE as the key.
        #    a. REMEMBER that missing values are coded "." in keys but ""
        #       in cell descriptors!
        # 4. Do the same thing for results:row:options:title and self._rows.
        # 5. For each record in results:cells, get a row-column coordinate by
        #    associating row_option and column_option with a key in each of
        #    self._rows and self._cols.
        #    a. Insert the value in results:cells:n:column into self.table
        #       at that row-column coordinate.
        pass

    def generate_table(self):
        """Return a dict with the column labels, row labels and cell values.

        ``self._cols`` and ``self._rows`` are for internal use and are
        deliberately left out of the result.
        """
        return {
            "clabels": self.col_labels,
            "rlabels": self.row_labels,
            "values": self.table,
        }


class DataPocket:
    """Rows of values keyed by integer row number.

    Each entry of ``rows`` is a dict ``{'name': ..., 'values': [...]}``.
    Row numbers are normalised with ``int()`` so ``"3"`` and ``3`` address
    the same row.
    """

    def __init__(self):
        self.rows = {}

    def addRow(self, rownum, name=None, values=None):
        """Create (or replace) a row; ``values`` is a list in column order."""
        self.rows[int(rownum)] = {
            'name': name,
            # Copy so later changes to the caller's list do not leak in.
            'values': [] if values is None else list(values),
        }

    def addName(self, rownum, name):
        """Set the name of an existing row; print a notice if it is absent."""
        key = int(rownum)
        if key in self.rows:
            self.rows[key]['name'] = name
        else:
            print("No such row.")

    def addValues(self, rownum, values):
        """Replace the values of an existing row (``values`` in column order)."""
        key = int(rownum)
        if key in self.rows:
            self.rows[key]['values'] = list(values)
        else:
            print("No such row.")

    def setValue(self, rownum, value, col):
        """Store ``value`` at zero-indexed column ``col`` of an existing row.

        When ``col`` lies past the end of the row, the gap is padded with
        ``None`` so that ``value`` ends up at index ``col``.
        """
        key = int(rownum)
        col = int(col)
        if key not in self.rows:
            print("No such row.")
            return
        values = self.rows[key]['values']
        try:
            values[col] = value
        except IndexError:
            # A non-positive pad count yields no padding at all.
            values.extend([None] * (col - len(values)))
            values.append(value)

    def printRow(self, rownum):
        """Print one row's number, name and values, or a notice if absent."""
        key = int(rownum)
        if key in self.rows:
            row = self.rows[key]
            print("Row {} (\"{}\"): {}".format(rownum, row['name'], row['values']))
        else:
            print("No such row.")

    def printData(self):
        """Print every stored row."""
        for rownum in self.rows:
            self.printRow(rownum)


class DataColl:
    """Query parameters for an NSDUH crosstab request.

    Assigning any parameter property regenerates ``endpoint`` and
    ``filename``.  Both stay ``None`` until year, row and column are all set.
    """

    def __init__(self):
        self._year = None
        self._row = None
        self._col = None
        self._control = None
        self._cf_eq = None
        self._control_filter = None
        self._weight = None
        self._chisq = None
        self._fmt = None
        # Derived values; initialised so the read-only properties never
        # raise AttributeError on a fresh object.
        self._endpoint = None
        self._filename = None

    @property
    def year(self):
        return self._year

    @year.setter
    def year(self, year):
        # NOTE: an earlier, disabled check warned that NSDUH has no data from
        # before 2014 and clamped the year to 2014; no clamping is done now.
        self._year = int(year)
        self._generate()

    @property
    def row(self):
        return self._row

    @row.setter
    def row(self, r):
        self._row = r
        self._generate()

    @property
    def col(self):
        return self._col

    @col.setter
    def col(self, c):
        self._col = c
        self._generate()

    @property
    def control(self):
        return self._control

    @control.setter
    def control(self, ctl):
        self._control = ctl
        self._generate()

    @property
    def cf_eq(self):
        return self._cf_eq

    @cf_eq.setter
    def cf_eq(self, eq):
        self._cf_eq = eq
        self._generate()

    @property
    def control_filter(self):
        return self._control_filter

    @control_filter.setter
    def control_filter(self, flt):
        self._control_filter = flt
        self._generate()

    @property
    def weight(self):
        return self._weight

    @weight.setter
    def weight(self, wgt):
        self._weight = wgt
        self._generate()

    @property
    def chisq(self):
        return self._chisq

    @chisq.setter
    def chisq(self, cs):
        self._chisq = cs
        self._generate()

    @property
    def fmt(self):
        return self._fmt

    @fmt.setter
    def fmt(self, f):
        self._fmt = f
        self._generate()

    @property
    def endpoint(self):
        return self._endpoint

    @property
    def filename(self):
        return self._filename

    def _generate(self):
        """Quietly rebuild the derived endpoint and filename."""
        self.setEndpoint(False)
        self.setFilename(False)

    def _has_required(self):
        """Return True when year, row and column have all been set."""
        return not (self._year is None or self._row is None or self._col is None)

    def _filter_negation(self, marker):
        """Return "" for an equality filter, else ``marker``.

        Only ``cf_eq == True`` selects equality; an unset ``cf_eq`` therefore
        yields the negated form.
        """
        return "" if self.cf_eq == True else marker  # noqa: E712

    def _chisq_enabled(self):
        """Return False when ``chisq`` is unset or equal to False."""
        return not (self.chisq is None or self.chisq == False)  # noqa: E712

    def setEndpoint(self, loud=True):
        """Build the crosstab URL from the current parameters.

        If year, row or column is missing, the endpoint is set to ``None``
        (and a message is printed when ``loud`` is true).
        """
        if not self._has_required():
            if loud:
                print("Year, row, and column must be set in order to build an endpoint.")
            self._endpoint = None
            return

        lcontrol = "" if self.control is None else "&control={}".format(self.control)
        # The filter is only meaningful together with a control variable.
        if self.control_filter is None or lcontrol == "":
            lcontrol_filter = ""
        else:
            lcontrol_filter = "&filter={}{}%3D{}".format(
                self.control, self._filter_negation("!"), self.control_filter
            )
        lweight = "" if self.weight is None else "&weight={}".format(self.weight)
        lchisq = "&run_chisq=true" if self._chisq_enabled() else "&run_chisq=false"
        # Anything other than a known format falls back to JSON.
        lfmt = self.fmt if self.fmt in ("json", "msgpack", "api") else "json"
        # The survey identifier names a two-year range starting at `year`.
        year_rng = "{}-{}".format(int(self.year), int(self.year) + 1)
        self._endpoint = (
            "https://rdas.samhsa.gov/api/surveys/NSDUH-{}-RD02YR/crosstab/"
            "?row={}&column={}{}{}{}{}&format={}"
        ).format(
            year_rng, self.row, self.col,
            lcontrol, lcontrol_filter, lweight, lchisq, lfmt,
        )

    def setFilename(self, loud=True, ext=None):
        """Build an output filename from the current parameters.

        ``ext`` is the file extension without the dot (default ``"csv"``).
        If year, row or column is missing, the filename is set to ``None``
        (and a message is printed when ``loud`` is true).
        """
        if not self._has_required():
            if loud:
                print("Year, row, and column must be set in order to build a filename.")
            self._filename = None
            return

        lext = "csv" if ext is None else ext
        lcontrol = "" if self.control is None else "_ctl_{}".format(self.control)
        if self.control_filter is None or lcontrol == "":
            lcontrol_filter = ""
        else:
            lcontrol_filter = "_{}eq_{}".format(
                self._filter_negation("n"), self.control_filter
            )
        lweight = "" if self.weight is None else "_weight_{}".format(self.weight)
        lchisq = "_chisq_true" if self._chisq_enabled() else "_chisq_false"
        self._filename = "NSDUH_{}_{}_vs_{}{}{}{}{}.{}".format(
            self.year, self.row, self.col,
            lcontrol, lcontrol_filter, lweight, lchisq, lext,
        )

    def toString(self):
        """Print a human-readable summary of the current settings."""
        print("Data Collector:")
        print("Row: {}, Column: {}".format(self.row, self.col))
        # control_filter is a single value; the comparison operator comes
        # from cf_eq, exactly as in setEndpoint.
        print("Controlling on {} {}= {}".format(
            self.control, self._filter_negation("!"), self.control_filter
        ))
        print("Weighted by {}".format(self.weight))
        print("Generating Chi-Squared" if self.chisq else "Not generating Chi-Squared")
        print("Formatting as {}".format(self.fmt))
        print("URL: {}".format(self.endpoint))


def main():
    """Request one sample crosstab and return the decoded JSON payload."""
    df = DataColl()
    df.year = 2017
    df.row = "STATE"
    df.col = "YOSELL2"
    df.control = "CATAG18"
    df.cf_eq = True
    df.control_filter = "2"
    df.weight = "DASWT_1"
    df.chisq = False
    df.fmt = "json"
    print(df.endpoint)
    df.setFilename(False, "csv")

    # A timeout keeps the script from hanging on an unresponsive server, and
    # raise_for_status surfaces HTTP errors before the body is parsed.
    r = requests.get(df.endpoint, timeout=60)
    r.raise_for_status()
    rjson = r.json()  # this creates a Python object, not a JSON string
    # in results/cells, column_option and row_option refer to the key field,
    # not the list index
    # in column_option and row_option, missing value is coded as "", but
    # coded as "." in key field
    return rjson


if __name__ == "__main__":
    main()