Python scripting to automate fetching data from RDAS.
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 8.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. import requests, time, os, sys, json, csv
  2. from bs4 import BeautifulSoup
  3. class DataTable:
  4. def __init__(self, rdas_obj=None):
  5. self.table = {}
  6. self._cols = {}
  7. self._rows = {}
  8. self.col_labels = {}
  9. self.row_labels = {}
  10. if rdas_obj != None:
  11. self.populate(rdas_obj)
  12. def populate(self, rdas_obj):
  13. # 1. Put values from results:column:options:key into a list.
  14. # a. Sort the list on the int value of the key.
  15. # b. Place the list into self._cols, using the value as the key and the index as the value.
  16. # 2. Do the same thing for the values from results:row:options:key and self._rows.
  17. # 3. Using the key-value pairs in self._cols, place results:column:options:title into self.col_labels
  18. # with the self._cols VALUE as the key.
  19. # a. REMEMBER that missing values are coded "." in keys but "" in cell descriptors!
  20. # 4. Do the same thing for results:row:options:title and self._rows.
  21. # 5. For each record in results:cells, get a row-column coordinate by associating row_option and column_option
  22. # with a key in each of self._rows and self._cols.
  23. # a. Insert the value in results:cells:n:column into self.table at that row-column coordinate.
  24. pass
  25. def generate_table(self):
  26. # Return an object containing self.col_labels, self.row_labels, and self.table.
  27. # self._cols and self._rows are for internal use and don't need to be produced.
  28. o = {
  29. "clabels": self.col_labels,
  30. "rlabels": self.row_labels,
  31. "values": self.table
  32. }
  33. return o
  34. class DataPocket:
  35. def __init__(self):
  36. self.rows = {}
  37. def addRow(self, rownum, name=None, values=None):
  38. ### values should be a list in column order
  39. rownum = int(rownum)
  40. d = {'name': name,
  41. 'values': [] if values == None else [x for x in values]
  42. }
  43. self.rows[rownum] = d
  44. def addName(self, rownum, name):
  45. if int(rownum) in self.rows.keys():
  46. self.rows[rownum]['name'] = name
  47. else:
  48. print("No such row.")
  49. def addValues(self, rownum, values):
  50. ### values should be a list in column order
  51. if int(rownum) in self.rows.keys():
  52. self.rows[rownum]['values'] = [x for x in values]
  53. else:
  54. print("No such row.")
  55. def setValue(self, rownum, value, col):
  56. ### col should be a zero-indexed integer
  57. col = int(col)
  58. if int(rownum) in self.rows.keys():
  59. try:
  60. self.rows[rownum]['values'][col] = value
  61. except IndexError:
  62. while len(self.rows[rownum]['values'] < col):
  63. self.rows[rownum]['values'].append(None)
  64. self.rows[rownum]['values'].append(value)
  65. else:
  66. print("No such row.")
  67. def printRow(self, rownum):
  68. if int(rownum) in self.rows.keys():
  69. print("Row {} (\"{}\"): {}".format(rownum, self.rows[rownum]['name'], self.rows[rownum]['values']))
  70. else:
  71. print("No such row.")
  72. def printData(self):
  73. for row in self.rows.keys():
  74. self.printRow(row)
  75. class DataColl:
  76. def __init__(self):
  77. self._year = None
  78. self._row = None
  79. self._col = None
  80. self._control = None
  81. self._cf_eq = None
  82. self._control_filter = None
  83. self._weight = None
  84. self._chisq = None
  85. self._fmt = None
  86. @property
  87. def year(self):
  88. return self._year
  89. @year.setter
  90. def year(self, year):
  91. # if int(year) < 2014:
  92. # print("NSDUH does not have data from before 2014. Setting to 2014.")
  93. # self._year = 2014
  94. self._year = int(year)
  95. self._generate()
  96. @property
  97. def row(self):
  98. return self._row
  99. @row.setter
  100. def row(self, r):
  101. self._row = r
  102. self._generate()
  103. @property
  104. def col(self):
  105. return self._col
  106. @col.setter
  107. def col(self, c):
  108. self._col = c
  109. self._generate()
  110. @property
  111. def control(self):
  112. return self._control
  113. @control.setter
  114. def control(self, ctl):
  115. self._control = ctl
  116. self._generate()
  117. @property
  118. def cf_eq(self):
  119. return self._cf_eq
  120. @cf_eq.setter
  121. def cf_eq(self, eq):
  122. self._cf_eq = eq
  123. self._generate()
  124. @property
  125. def control_filter(self):
  126. return self._control_filter
  127. @control_filter.setter
  128. def control_filter(self, filter):
  129. self._control_filter = filter
  130. self._generate()
  131. @property
  132. def weight(self):
  133. return self._weight
  134. @weight.setter
  135. def weight(self, wgt):
  136. self._weight = wgt
  137. self._generate()
  138. @property
  139. def chisq(self):
  140. return self._chisq
  141. @chisq.setter
  142. def chisq(self, cs):
  143. self._chisq = cs
  144. self._generate()
  145. @property
  146. def fmt(self):
  147. return self._fmt
  148. @fmt.setter
  149. def fmt(self, f):
  150. self._fmt = f
  151. self._generate()
  152. @property
  153. def endpoint(self):
  154. return self._endpoint
  155. def _generate(self):
  156. self.setEndpoint(False)
  157. self.setFilename(False)
  158. def setEndpoint(self, loud=True):
  159. if loud and self._year == None or self._row == None or self._col == None:
  160. print("Year, row, and column must be set in order to build an endpoint.")
  161. self._endpoint = None
  162. lcontrol = "" if self.control == None else "&control={}".format(self.control)
  163. lcfnot = "" if self.cf_eq == True else "!"
  164. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "&filter={}{}%3D{}".format(self.control, lcfnot, self.control_filter)
  165. lweight = "" if self.weight == None else "&weight={}".format(self.weight)
  166. lchisq = "&run_chisq=false" if self.chisq == False or self.chisq == None else "&run_chisq=true"
  167. lfmt = "json" if (self.fmt == None or self.fmt not in ["json", "msgpack", "api"]) else self.fmt
  168. year_rng = "{}-{}".format(int(self.year), int(self.year)+1)
  169. self._endpoint = "https://rdas.samhsa.gov/api/surveys/NSDUH-{}-RD02YR/crosstab/?row={}&column={}{}{}{}{}&format={}".format(
  170. year_rng,
  171. self.row,
  172. self.col,
  173. lcontrol,
  174. lcontrol_filter,
  175. lweight,
  176. lchisq,
  177. lfmt
  178. )
  179. @property
  180. def filename(self):
  181. return self._filename
  182. def setFilename(self, loud=True, ext=None):
  183. if loud and self._year == None or self._row == None or self._col == None:
  184. print("Year, row, and column must be set in order to build a filename.")
  185. self._filename = None
  186. lext = "csv" if ext == None else ext
  187. lcontrol = "" if self.control == None else "_ctl_{}".format(self.control)
  188. lcfnot = "" if self.cf_eq == True else "n"
  189. lcontrol_filter = "" if (self.control_filter == None or lcontrol == "") else "_{}eq_{}".format(lcfnot, self.control_filter)
  190. lweight = "" if self.weight == None else "_weight_{}".format(self.weight)
  191. lchisq = "_chisq_false" if self.chisq == False or self.chisq == None else "_chisq_true"
  192. self._filename = "NSDUH_{}_{}_vs_{}{}{}{}{}.{}".format(
  193. self.year,
  194. self.row,
  195. self.col,
  196. lcontrol,
  197. lcontrol_filter,
  198. lweight,
  199. lchisq,
  200. lext
  201. )
  202. def toString(self):
  203. print("Data Collector:")
  204. print("Row: {}, Column: {}".format(self.row, self.col))
  205. print("Controlling on {} {}= {}".format(self.control, self.control_filter[0], self.control_filter[1]))
  206. print("Weighted by {}".format(self.weight))
  207. print("Generating Chi-Squared" if self.chisq else "Not generating Chi-Squared")
  208. print("Formatting as {}".format(self.fmt))
  209. print("URL: {}".format(self.endpoint))
  210. def main():
  211. df = DataColl()
  212. df.year = 2017
  213. df.row = "STATE"
  214. df.col = "YOSELL2"
  215. df.control = "CATAG18"
  216. df.cf_eq = True
  217. df.control_filter = "2"
  218. df.weight = "DASWT_1"
  219. df.chisq = False
  220. df.fmt = "json"
  221. print(df.endpoint)
  222. df.setFilename(False, "csv")
  223. r = requests.get(df.endpoint)
  224. rjson = r.json() # this creates a Python object, not a JSON string
  225. # in results/cells, column_option and row_option refer to the key field, not the list index
  226. # in column_option and row_option, missing value is coded as "", but coded as "." in key field
  227. if __name__ == "__main__":
  228. main()