Python scripting to automate fetching data from RDAS.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

main.py 8.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. import requests, time, os, sys, json, csv
  2. from bs4 import BeautifulSoup
  3. class DataTable:
  4. def __init__(self, rdas_obj=None):
  5. self.table = {}
  6. self._cols = {}
  7. self._rows = {}
  8. self.col_labels = {}
  9. self.row_labels = {}
  10. if rdas_obj != None:
  11. self.populate(rdas_obj)
  12. def populate(self, rdas_obj):
  13. # 1. Put values from results:column:options:key into a list.
  14. # a. Sort the list on the int value of the key.
  15. # b. Place the list into self._cols, using the value as the key and the index as the value.
  16. # 2. Do the same thing for the values from results:row:options:key and self._rows.
  17. # 3. Using the key-value pairs in self._cols, place results:column:options:title into self.col_labels
  18. # with the self._cols VALUE as the key.
  19. # a. REMEMBER that missing values are coded "." in keys but "" in cell descriptors!
  20. # 4. Do the same thing for results:row:options:title and self._rows.
  21. # 5. For each record in results:cells, get a row-column coordinate by associating row_option and column_option
  22. # with a key in each of self._rows and self._cols.
  23. # a. Insert the value in results:cells:n:column into self.table at that row-column coordinate.
  24. pass
  25. def generate_table(self):
  26. # Return an object containing self.col_labels, self.row_labels, and self.table.
  27. # self._cols and self._rows are for internal use and don't need to be produced.
  28. o = {
  29. "clabels": self.col_labels,
  30. "rlabels": self.row_labels,
  31. "values": self.table
  32. }
  33. return o
  34. class DataPocket:
  35. def __init__(self):
  36. self.rows = {}
  37. def addRow(self, rownum, name=None, values=None):
  38. ### values should be a list in column order
  39. rownum = int(rownum)
  40. d = {'name': name,
  41. 'values': [] if values == None else [x for x in values]
  42. }
  43. self.rows[rownum] = d
  44. def addName(self, rownum, name):
  45. if int(rownum) in self.rows.keys():
  46. self.rows[rownum]['name'] = name
  47. else:
  48. print("No such row.")
  49. def addValues(self, rownum, values):
  50. ### values should be a list in column order
  51. if int(rownum) in self.rows.keys():
  52. self.rows[rownum]['values'] = [x for x in values]
  53. else:
  54. print("No such row.")
  55. def setValue(self, rownum, value, col):
  56. ### col should be a zero-indexed integer
  57. col = int(col)
  58. if int(rownum) in self.rows.keys():
  59. try:
  60. self.rows[rownum]['values'][col] = value
  61. except IndexError:
  62. while len(self.rows[rownum]['values'] < col):
  63. self.rows[rownum]['values'].append(None)
  64. self.rows[rownum]['values'].append(value)
  65. else:
  66. print("No such row.")
  67. def printRow(self, rownum):
  68. if int(rownum) in self.rows.keys():
  69. print("Row {} (\"{}\"): {}".format(rownum, self.rows[rownum]['name'], self.rows[rownum]['values']))
  70. else:
  71. print("No such row.")
  72. def printData(self):
  73. for row in self.rows.keys():
  74. self.printRow(row)
  75. class DataColl:
  76. def __init__(self):
  77. self._year = None
  78. self._row = None
  79. self._col = None
  80. self._control = None
  81. self._cf_eq = None
  82. self._control_filter = None
  83. self._weight = None
  84. self._chisq = None
  85. self._fmt = None
  86. @property
  87. def year(self):
  88. return self._year
  89. @year.setter
  90. def year(self, year):
  91. # if int(year) < 2014:
  92. # print("NSDUH does not have data from before 2014. Setting to 2014.")
  93. # self._year = 2014
  94. self._year = int(year)
  95. self._generate()
  96. @property
  97. def row(self):
  98. return self._row
  99. @row.setter
  100. def row(self, r):
  101. self._row = r
  102. self._generate()
  103. @property
  104. def col(self):
  105. return self._col
  106. @col.setter
  107. def col(self, c):
  108. self._col = c
  109. self._generate()
  110. @property
  111. def control(self):
  112. return self._control
  113. @control.setter
  114. def control(self, ctl):
  115. self._control = ctl
  116. self._generate()
  117. @property
  118. def cf_eq(self):
  119. return self._cf_eq
  120. @cf_eq.setter
  121. def cf_eq(self, eq):
  122. self._cf_eq = eq
  123. self._generate()
  124. @property
  125. def control_filter(self):
  126. return self._control_filter
  127. @control_filter.setter
  128. def control_filter(self, filter):
  129. self._control_filter = filter
  130. self._generate()
  131. @property
  132. def weight(self):
  133. return self._weight
  134. @weight.setter
  135. def weight(self, wgt):
  136. self._weight = wgt
  137. self._generate()
  138. @property
  139. def chisq(self):
  140. return self._chisq
  141. @chisq.setter
  142. def chisq(self, cs):
  143. self._chisq = cs
  144. self._generate()
  145. @property
  146. def fmt(self):
  147. return self._fmt
  148. @fmt.setter
  149. def fmt(self, f):
  150. self._fmt = f
  151. self._generate()
  152. @property
  153. def endpoint(self):
  154. return self._endpoint
  155. def _generate(self):
  156. self.setEndpoint(False)
  157. self.setFilename(False)
  158. def setEndpoint(self, loud=True):
  159. if loud and self._year == None or self._row == None or self._col == None:
  160. print("Year, row, and column must be set in order to build an endpoint.")
  161. self._endpoint = None
  162. lcontrol = "" if self.control == None else "&control={}".format(self.control)
  163. lcfnot = "" if self.cf_eq == True else "!"
  164. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "&filter={}{}%3D{}".format(self.control, lcfnot, self.control_filter)
  165. lweight = "" if self.weight == None else "&weight={}".format(self.weight)
  166. lchisq = "&run_chisq=false" if self.chisq == False or self.chisq == None else "&run_chisq=true"
  167. lfmt = "json" if (self.fmt == None or self.fmt not in ["json", "msgpack", "api"]) else self.fmt
  168. year_rng = "{}-{}".format(int(self.year), int(self.year)+1)
  169. self._endpoint = "https://rdas.samhsa.gov/api/surveys/NSDUH-{}-RD02YR/crosstab/?row={}&column={}{}{}{}{}&format={}".format(
  170. year_rng,
  171. self.row,
  172. self.col,
  173. lcontrol,
  174. lcontrol_filter,
  175. lweight,
  176. lchisq,
  177. lfmt
  178. )
  179. @property
  180. def filename(self):
  181. return self._filename
  182. def setFilename(self, loud=True, ext=None):
  183. if loud and self._year == None or self._row == None or self._col == None:
  184. print("Year, row, and column must be set in order to build a filename.")
  185. self._filename = None
  186. lext = "csv" if ext == None else ext
  187. lcontrol = "" if self.control == None else "_ctl_{}".format(self.control)
  188. lcfnot = "" if self.cf_eq == True else "n"
  189. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "_{}eq_{}".format(lcfnot, self.control_filter)
  190. lweight = "" if self.weight == None else "_weight_{}".format(self.weight)
  191. lchisq = "_chisq_false" if self.chisq == False or self.chisq == None else "_chisq_true"
  192. self._filename = "NSDUH_{}_{}_vs_{}{}{}{}{}.{}".format(
  193. self.year,
  194. self.row,
  195. self.col,
  196. lcontrol,
  197. lcontrol_filter,
  198. lweight,
  199. lchisq,
  200. lext
  201. )
  202. def toString(self):
  203. print("Data Collector:")
  204. print("Row: {}, Column: {}".format(self.row, self.col))
  205. print("Controlling on {} {}= {}".format(self.control, self.control_filter[0], self.control_filter[1]))
  206. print("Weighted by {}".format(self.weight))
  207. print("Generating Chi-Squared" if self.chisq else "Not generating Chi-Squared")
  208. print("Formatting as {}".format(self.fmt))
  209. print("URL: {}".format(self.endpoint))
  210. def main():
  211. df = DataColl()
  212. df.year = 2017
  213. df.row = "STATE"
  214. df.col = "YOSELL2"
  215. df.control = "CATAG18"
  216. df.cf_eq = True
  217. df.control_filter = "2"
  218. df.weight = "DASWT_1"
  219. df.chisq = False
  220. df.fmt = "json"
  221. print(df.endpoint)
  222. df.setFilename(False, "csv")
  223. r = requests.get(df.endpoint)
  224. rjson = r.json() # this creates a Python object, not a JSON string
  225. # in results/cells, column_option and row_option refer to the key field, not the list index
  226. # in column_option and row_option, missing value is coded as "", but coded as "." in key field
  227. if __name__ == "__main__":
  228. main()